def Specialize(self, atomizer: atomizers.AtomizerBase) -> None:
  """Specialize a termination criterion to a vocabulary.

  This enables the termination criterion to set state specialized to a
  specific encoding vocabulary. This is guaranteed to be called before
  SampleIsComplete(), and ensures that the vocabulary used for all sample
  arguments to SampleIsComplete() is from this vocabulary.

  Args:
    atomizer: An atomizer to specialize to.

  Raises:
    InvalidSymtokTokens: If the depth tokens can't be encoded, or they encode
      to more than one token.
  """
  try:
    left = atomizer.AtomizeString(self.left_token)
    right = atomizer.AtomizeString(self.right_token)
    if len(left) > 1 or len(right) > 1:
      raise errors.InvalidSymtokTokens(
          "Sampler symmetrical depth tokens do not encode to a single "
          "token using the corpus vocabulary")
  except errors.VocabError:
    raise errors.InvalidSymtokTokens(
        "Sampler symmetrical depth tokens cannot be encoded using the "
        "corpus vocabulary")
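# Illustrative sketch only (not CLgen code): a minimal depth-count check in the
# spirit of the symmetric-token terminator specialized above. It assumes the
# left/right tokens ('{' and '}') each correspond to exactly one vocabulary
# token, which is precisely the property that Specialize() verifies.
def SampleLooksComplete(tokens, left="{", right="}"):
  """Return True once brackets have opened and returned to depth zero."""
  depth = 0
  seen_left = False
  for token in tokens:
    if token == left:
      depth += 1
      seen_left = True
    elif token == right:
      depth -= 1
    if seen_left and depth == 0:
      return True
  return False


assert SampleLooksComplete(["kernel", " ", "{", "x", "}"])
assert not SampleLooksComplete(["kernel", " ", "{", "x"])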
def Specialize(self, atomizer: atomizers.AtomizerBase) -> None:
  """Specialize a sampler to a vocabulary.

  This enables the sampler to set state specialized to a specific encoding
  vocabulary. This is guaranteed to be called before SampleIsComplete(), and
  ensures that the vocabulary used for all sample arguments to
  SampleIsComplete() is from this vocabulary.

  Args:
    atomizer: An atomizer to specialize to.

  Raises:
    InvalidStartText: If the start_text cannot be encoded using the
      vocabulary.
    UserError: In case the sampler cannot be specialized to this vocabulary.
  """
  try:
    self.encoded_start_text = atomizer.AtomizeString(self.start_text)
    self.tokenized_start_text = atomizer.TokenizeString(self.start_text)
  except errors.VocabError:
    raise errors.InvalidStartText(
        "Sampler start text cannot be encoded using the corpus vocabulary: "
        f"'{self.start_text}'")

  if len(self.encoded_start_text) >= self.sequence_length:
    raise errors.InvalidStartText(
        "Encoded sampler start text must be less than sampler sequence "
        f"length. Sampler sequence length={self.sequence_length}, encoded "
        f"start text length={len(self.encoded_start_text)}")

  for terminator in self.terminators:
    terminator.Specialize(atomizer)
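# Illustrative sketch only: it mimics the checks that Specialize() performs,
# using a hypothetical character-level atomizer, to show why an overlong start
# text is rejected. The `ToyAtomizer` class and the vocabulary below are
# assumptions for demonstration, not CLgen APIs.
import numpy as np


class ToyAtomizer:
  """A hypothetical character-level atomizer."""

  def __init__(self, vocab):
    self.vocab = vocab  # Maps token string -> integer id.

  def AtomizeString(self, text):
    # Raises KeyError (standing in for errors.VocabError) on unknown chars.
    return np.array([self.vocab[c] for c in text], dtype=np.int32)

  def TokenizeString(self, text):
    return list(text)


toy = ToyAtomizer({"k": 0, "e": 1, "r": 2, "n": 3, "l": 4, " ": 5})
start_text = "kernel "
encoded = toy.AtomizeString(start_text)
sequence_length = 64
# Mirrors the invariant enforced by Specialize(): the encoded start text must
# be strictly shorter than the sampler's sequence length.
assert len(encoded) < sequence_length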
@classmethod
def FromPreprocessed(
    cls,
    preprocessed_cf: preprocessed.PreprocessedContentFile,
    atomizer: atomizers.AtomizerBase,
    eof: str,
) -> "EncodedContentFile":
  """Instantiate an EncodedContentFile from a preprocessed file.

  Args:
    preprocessed_cf: A PreprocessedContentFile instance.
    atomizer: The atomizer to encode using.
    eof: An end-of-file marker which is concatenated to the encoded sequence.

  Returns:
    An EncodedContentFile instance.
  """
  start_time = time.time()
  data = atomizer.AtomizeString(preprocessed_cf.text)
  encoding_time_ms = int((time.time() - start_time) * 1000)
  return EncodedContentFile(
      id=preprocessed_cf.id,
      # Encode the end-of-file marker separately to ensure that it resolves to
      # the correct token. For example, if the vocabulary contains 'a', 'b',
      # and 'ab', then a content file 'a' with EOF marker 'b' would be encoded
      # as 'ab', instead of 'a' + 'b'.
      data=cls.NumpyArrayToDataString(
          np.concatenate((data, atomizer.AtomizeString(eof)))),
      tokencount=len(data),
      encoding_time_ms=encoding_time_ms,
      wall_time_ms=encoding_time_ms,  # The outer-loop may change this.
      date_added=datetime.datetime.utcnow(),
  )
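# Illustrative sketch only: a toy greedy longest-match tokenizer showing the
# end-of-file pitfall described in the comment above. With a vocabulary of
# 'a', 'b' and 'ab', encoding the concatenation "a" + "b" merges the two into
# the single token 'ab', whereas encoding the file and the EOF marker
# separately preserves the marker as its own token. The names here are
# assumptions for demonstration, not CLgen APIs.
def GreedyEncode(text, vocab):
  """Encode `text` by repeatedly taking the longest matching vocab entry."""
  tokens = []
  i = 0
  while i < len(text):
    match = max((t for t in vocab if text.startswith(t, i)), key=len)
    tokens.append(match)
    i += len(match)
  return tokens


vocab = {"a", "b", "ab"}
print(GreedyEncode("a" + "b", vocab))                       # ['ab'] -- EOF marker lost.
print(GreedyEncode("a", vocab) + GreedyEncode("b", vocab))  # ['a', 'b']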
def EncodeAndPad(srcs: typing.List[str], padded_length: int,
                 atomizer: atomizers.AtomizerBase) -> np.array:
  """Encode and pad source code strings for training."""
  seqs = [atomizer.AtomizeString(src) for src in srcs]
  pad_val = atomizer.vocab_size
  encoded = np.array(
      sequence.pad_sequences(seqs, maxlen=padded_length, value=pad_val))
  return np.vstack([np.expand_dims(x, axis=0) for x in encoded])
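# Illustrative sketch only: what EncodeAndPad's output looks like for two short
# sequences. It assumes a hypothetical atomizer whose vocab_size is 4, and
# mimics Keras' pad_sequences (which left-pads by default) with plain numpy so
# that the example runs without TensorFlow installed.
import numpy as np

vocab_size = 4          # Hypothetical atomizer.vocab_size.
pad_val = vocab_size    # Padding uses an id one past the real vocabulary.
padded_length = 5
seqs = [np.array([0, 1, 2]), np.array([3, 1])]

padded = np.full((len(seqs), padded_length), pad_val, dtype=np.int32)
for row, seq in zip(padded, seqs):
  row[-len(seq):] = seq  # Left-pad, matching pad_sequences' default.

print(padded)
# [[4 4 0 1 2]
#  [4 4 4 3 1]]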
def EncodeAndPadSources(atomizer: atomizers.AtomizerBase,
                        srcs: typing.List[str], maxlen: int) -> np.array:
  """Encode and pad source code for learning."""
  seqs = [atomizer.AtomizeString(src) for src in srcs]
  pad_val = atomizer.vocab_size
  encoded = np.array(
      keras_sequence.pad_sequences(seqs, maxlen=maxlen, value=pad_val))
  return np.vstack([np.expand_dims(x, axis=0) for x in encoded])
def MakeProgram(
    self,
    sampled_tokens: typing.List[str],
    backtracker: OpenClBacktrackingHelper,
    atomizer: atomizers.AtomizerBase,
) -> typing.List[str]:
  """Produce a kernel from a sample."""
  src = backtracker.TryToCloseProgram(sampled_tokens) or ""
  return atomizer.TokenizeString(src)
def SampleOneWithBacktrackingToTextStream(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    backtracker: OpenClBacktrackingHelper,
) -> typing.Iterator[str]:
  """Produce a single sample using backtracking, streaming progress as text.

  Args:
    sampler: A Sampler instance, used to determine the start text, and when
      to terminate sampling.
    atomizer: The corpus vocabulary atomizer.
    backtracker: An instance of the backtracking helper class.

  Yields:
    Progress updates formatted as server-sent event (SSE) data strings.
  """
  data = {
      "sample_in_progress": "",
      "candidate": "",
      "status": "running",
      "elapsed": 0,
  }

  def Data(data: Dict[str, Any]) -> str:
    data["elapsed"] = humanize.Duration(time.time() - start_time)
    return f"retry: 100\ndata: {json.dumps(data)}\n\n"

  start_time = time.time()
  # During sampling, 'sample_in_progress' contains the sequence of tokens that
  # is restored when backtracking.
  sample_in_progress = sampler.tokenized_start_text.copy()
  rollback_state, rollback_index = self.backend.EvaluateSampleState(sampler)
  rollback_history = [(
      copy.deepcopy(backtracker),
      list(sample_in_progress),
      rollback_state,
      rollback_index,
  )]
  stagnation = 0

  data["sample_in_progress"] = "".join(sample_in_progress)
  yield Data(data)

  # Generate a batch of candidates at each step.
  for step_count in range(
      1, FLAGS.experimental_clgen_backtracking_max_steps + 1):
    self._logger.OnSampleStep(backtracker, 0, len(sample_in_progress))
    data["sample_in_progress"] = "".join(sample_in_progress)
    data["candidate"] = ""
    data["status"] = f"step {step_count}"
    app.Log(4, "Current sample in progress: `%s`",
            "".join(sample_in_progress))
    yield Data(data)

    # Generate a batch of candidate statements to choose from.
    candidate_statements = []
    for _ in range(FLAGS.experimental_clgen_backtracking_max_attempts):
      yield Data(data)
      candidate_statements.extend(
          self.TryToGenerateCandidateStatements(
              sample_in_progress,
              rollback_state,
              rollback_index,
              backtracker,
              sampler,
              atomizer,
          ))
      if candidate_statements:
        data["candidate"] = html.escape("".join(
            candidate_statements[-1].statement))
        yield Data(data)
      candidate_statements = [
          c for c in candidate_statements if c.feature_distance is not None
      ]
      if (len(candidate_statements) >=
          FLAGS.experimental_clgen_backtracking_candidates_per_step):
        break

    if not candidate_statements:
      app.Log(2, "Failed to produce any candidate statement after %d attempts",
              FLAGS.experimental_clgen_backtracking_max_attempts)
      break

    # Select a candidate using stochastic hill climbing: candidates which
    # reduce the feature distance are weighted by how much they improve it,
    # while candidates which regress are clamped to a small probability.
    old = backtracker.feature_distance
    deltas = np.array([old - c.feature_distance for c in candidate_statements])
    deltas = deltas + 0.5
    deltas[deltas < 0.5] = 0.1
    deltas = deltas / np.sum(deltas)
    sel_candidate_idx = np.random.choice(len(candidate_statements), p=deltas)
    sel_candidate = candidate_statements[sel_candidate_idx]
    app.Log(
        2, "Selected feature distance (%f) at step %d from candidates: %s",
        sel_candidate.feature_distance, step_count,
        SummarizeFloats(c.feature_distance for c in candidate_statements))
    app.Log(4, "Selected statement: %s", "".join(sel_candidate.statement))

    if backtracker.feature_distance - sel_candidate.feature_distance <= 0:
      # No improvement this step. After too many stagnant steps, roll back to
      # an earlier checkpoint.
      stagnation += 1
      if stagnation > 10:
        (
            backtracker,
            sample_in_progress,
            rollback_state,
            rollback_index,
        ) = rollback_history.pop()
        stagnation = 0
        app.Log(4, "Got stuck. Backtracking")
        continue
    else:
      stagnation = 0
      if step_count % 10 == 0:
        rollback_history.append((
            copy.deepcopy(backtracker),
            list(sample_in_progress),
            rollback_state,
            rollback_index,
        ))

    # Set the sampler's rollback state to be the state produced by feeding
    # the selected candidate as input, so that future samples start from the
    # right state.
    if len(sel_candidate.statement) > 0:
      sample_in_progress += sel_candidate.statement
      encoded_sel_candidate = atomizer.AtomizeString("".join(
          sel_candidate.statement))
      arr = np.concatenate([rollback_index, encoded_sel_candidate])
      self.backend.ResetSampleState(sampler, state=rollback_state, seed=arr)
      rollback_state, rollback_index = self.backend.EvaluateSampleState(
          sampler)

    app.Log(5, "Current sample at step %d: %s", step_count,
            "".join(sample_in_progress))

    if backtracker.IsDone(sample_in_progress):
      app.Log(2, "Backtracking complete after %d steps", step_count)
      break
  else:
    app.Log(2, "Backtracking failed to complete after %d steps", step_count)

  yield Data(data)
def SampleOneWithBacktracking(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    backtracker: OpenClBacktrackingHelper,
) -> typing.List[str]:
  """Produce a single sample using backtracking.

  Args:
    sampler: A Sampler instance, used to determine the start text, and when
      to terminate sampling.
    atomizer: The corpus vocabulary atomizer.
    backtracker: An instance of the backtracking helper class.

  Returns:
    A sample, as a sequence of strings.
  """
  # During sampling, 'sample_in_progress' contains the sequence of tokens that
  # is restored when backtracking.
  sample_in_progress = sampler.tokenized_start_text.copy()
  self.backend.RandomizeSampleState()
  rollback_state, rollback_index = self.backend.EvaluateSampleState(sampler)
  rollback_history = [(
      copy.deepcopy(backtracker),
      list(sample_in_progress),
      rollback_state,
      rollback_index,
  )]
  stagnation = 0

  # Generate a batch of candidates at each step.
  for step_count in range(
      1, FLAGS.experimental_clgen_backtracking_max_steps + 1):
    self._logger.OnSampleStep(backtracker, 0, len(sample_in_progress))
    app.Log(4, "Current sample in progress: `%s`",
            "".join(sample_in_progress))

    # Generate a batch of candidate statements to choose from.
    candidate_statements = []
    for _ in range(FLAGS.experimental_clgen_backtracking_max_attempts *
                   FLAGS.experimental_clgen_backtracking_candidates_per_step):
      candidate_statements.extend([
          c for c in self.TryToGenerateCandidateStatements(
              sample_in_progress,
              rollback_state,
              rollback_index,
              backtracker,
              sampler,
              atomizer,
          ) if c.feature_distance is not None
      ])
      if (len(candidate_statements) >=
          FLAGS.experimental_clgen_backtracking_candidates_per_step):
        break

    if not candidate_statements:
      app.Log(2, "Failed to produce any candidate statement after %d attempts",
              FLAGS.experimental_clgen_backtracking_max_attempts)
      break

    # Select the best candidate: the one with the smallest feature distance
    # when targeting features, else a random choice.
    if self._target_features is not None:
      best_candidate = min(candidate_statements,
                           key=lambda x: x.feature_distance)
    else:
      best_candidate = random.choice(candidate_statements)
    app.Log(
        2, "Selected best feature distance (%f) at step %d from candidates: %s",
        best_candidate.feature_distance, step_count,
        SummarizeFloats(c.feature_distance for c in candidate_statements))
    app.Log(4, "Selected best statement: %s",
            "".join(best_candidate.statement))

    # Set the sampler's rollback state to be the state produced by feeding
    # the best candidate as input, so that future samples start from the
    # right state.
    if len(best_candidate.statement) > 0:
      sample_in_progress += best_candidate.statement
      encoded_best_candidate = atomizer.AtomizeString("".join(
          best_candidate.statement))
      arr = np.concatenate([rollback_index, encoded_best_candidate])
      self.backend.ResetSampleState(sampler, state=rollback_state, seed=arr)
      rollback_state, rollback_index = self.backend.EvaluateSampleState(
          sampler)

    app.Log(5, "Current sample at step %d: %s", step_count,
            "".join(sample_in_progress))

    if backtracker.IsDone(sample_in_progress):
      app.Log(2, "Backtracking complete after %d steps", step_count)
      break
  else:
    app.Log(2, "Backtracking failed to complete after %d steps", step_count)

  return self.MakeProgram(sample_in_progress, backtracker, atomizer)
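# Illustrative sketch only: the candidate-selection rule used above, applied to
# made-up data. When target features are set, the candidate with the smallest
# feature distance wins; without a target, selection is uniformly random. The
# Candidate tuple and the values below are assumptions for demonstration, not
# CLgen types.
import collections
import random

Candidate = collections.namedtuple("Candidate", ["statement", "feature_distance"])
candidates = [
    Candidate(["int x = 0;"], 2.5),
    Candidate(["barrier(CLK_LOCAL_MEM_FENCE);"], 1.0),
    Candidate(["x += 1;"], 3.2),
]

target_features = [80, 4, 1, 0]  # Hypothetical target feature vector.
if target_features is not None:
  best = min(candidates, key=lambda c: c.feature_distance)  # distance 1.0
else:
  best = random.choice(candidates)
print("".join(best.statement))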