def get_serialized_info(self):
    """Describes how each feature is laid out when stored on disk.

    Starts from the decoded layout returned by `get_tensor_info` and replaces
    the entries listed below with their serialized form: the index/count
    features get shape (1,), and the matrix-like features get shape (None,).

    Returns:
      A dict mapping feature name to a `tfds.features.TensorInfo`.
    """
    # TODO(dbieber): See if you can switch to something general purpose like:
    # return {
    #     key: tfds.features.TensorInfo(shape=(None,), dtype=value.dtype)
    #     for key, value in info.items()
    # }
    make_info = tfds.features.TensorInfo
    serialized = self.get_tensor_info()

    # Single-element features.
    for name in ('start_index', 'exit_index', 'count'):
        serialized[name] = make_info(shape=(1, ), dtype=tf.int32)
    for name in ('num_nodes', 'steps'):
        serialized[name] = make_info(shape=(1, ), dtype=tf.int64)

    # Features that only exist in some dataset versions.
    if control_flow_programs_version.at_least('0.0.44'):
        serialized['max_indent'] = make_info(shape=(1, ), dtype=tf.float32)
        serialized['cyclomatic_complexity'] = make_info(shape=(1, ),
                                                        dtype=tf.int64)
    if control_flow_programs_version.supports_edge_types():
        serialized['edge_types'] = make_info(shape=(None, ), dtype=tf.int64)

    # Features stored as a single variable-length dimension.
    for name in ('data', 'adjacency_matrix', 'post_domination_matrix'):
        serialized[name] = make_info(shape=(None, ), dtype=tf.float64)
    serialized['adjacency_list'] = make_info(shape=(None, ), dtype=tf.int64)
    return serialized
def get_program_encoder(program_generator_config):
    """Builds the TextEncoder selected by the generator config.

    Args:
      program_generator_config: The program generator config. Its
        `encoder_name` selects the encoder; for "simple" its `base`,
        `num_digits` and `ops` are read as well.

    Returns:
      A `SimplePythonSourceEncoder` when `encoder_name` is "simple", or a
      `TextSourceEncoder` when it is "text".

    Raises:
      ValueError: If `encoder_name` is neither "simple" nor "text".
    """
    encoder_name = program_generator_config.encoder_name
    if encoder_name == "text":
        return TextSourceEncoder()
    if encoder_name != "simple":
        raise ValueError("Unexpected encoder_name", encoder_name)

    assignment_and_comparison_ops = ["=", ">", ">=", "<", "<="]
    keyword_ops = ["if", "else", "while"]
    conditional_ops = [
        "if >",
        "if <",
        "while >",
        "while <",
        "if >=",
        "if <=",
        "while >=",
        "while <=",
    ]
    trailing_ops = ["pass", "continue", "break"]

    # From 0.0.42 on, every comparison/conditional op also has a "%" variant.
    if control_flow_programs_version.at_least("0.0.42"):
        mod_comparison_ops = [op + " %" for op in assignment_and_comparison_ops]
        mod_conditional_ops = [op + " %" for op in conditional_ops]
    else:
        mod_comparison_ops = []
        mod_conditional_ops = []

    all_ops = (
        list(program_generator_config.ops)
        + assignment_and_comparison_ops
        + mod_comparison_ops
        + keyword_ops
        + conditional_ops
        + mod_conditional_ops
        + trailing_ops
    )
    return SimplePythonSourceEncoder(
        base=program_generator_config.base,
        num_digits=program_generator_config.num_digits,
        ops=all_ops,
        num_variables=10,  # TODO(dbieber): num_variables is hardcoded.
    )
def generate_example_from_python_object(executor, base, python_object,
                                        tokens_per_statement,
                                        target_output_length, mod, output_mod):
    """Generates an example dict from the program given by `python_object`.

    When `python_object` is a tuple, both the full and the partial program are
    turned into examples, and the partial example is returned with its target
    fields overwritten by those of the full example.

    Args:
      executor: A python_interpreter Executor object.
      base: The base in which numbers are represented.
      python_object: Either a string representing Python source, or a tuple of
        the form (python_source, partial_python_source) where
        partial_python_source has a single line replaced with a placeholder.
      tokens_per_statement: The number of tokens to use to represent a
        statement in the encoded program.
      target_output_length: The length of the program output, measured in
        tokens.
      mod: The value (if any) to mod the intermediate values of the program by
        after each step of execution.
      output_mod: The value (if any) to mod the final values of the program by.

    Returns:
      An example dictionary.
    """

    def make_example(source):
        return _generate_example_from_python_source(executor, base, source,
                                                    tokens_per_statement,
                                                    target_output_length, mod,
                                                    output_mod)

    if not isinstance(python_object, tuple):
        # Plain source: just generate a full example.
        return make_example(python_object)

    # Generate the full example first, then the partial one.
    full_source, partial_source = python_object
    full_example = make_example(full_source)
    partial_example = make_example(partial_source)

    # TODO(dbieber): Use a more general method for listing output fields.
    copied_fields = []
    if control_flow_programs_version.at_least("0.0.52"):
        copied_fields.append("error_type")
    copied_fields.extend([
        "target_output",
        "target_output_length",
        "human_readable_target_output",
    ])
    for field in copied_fields:
        partial_example[field] = full_example[field]
    return partial_example
def get_shepherd_info(self):
    """Returns the list of `shepherds_lib` shepherds for this feature.

    The base list is always returned; the cyclomatic-complexity/max-indent and
    edge-type shepherds are appended only when the dataset version has them.
    """
    mod_padding = 32
    dense = shepherds_lib.DenseTensorShepherd

    shepherds_info = [
        shepherds_lib.NodeIndexShepherd(key,
                                        node_count_key='count',
                                        dtype=tf.int32)
        for key in ('start_index', 'exit_index')
    ]
    shepherds_info += [
        shepherds_lib.NodeIndicesShepherd(key,
                                          node_count_key='count',
                                          dtype=tf.int64,
                                          shape=[None],
                                          mod_paddings=[mod_padding])
        for key in ('true_branch_nodes', 'false_branch_nodes')
    ]
    shepherds_info += [
        dense('num_nodes', dtype=tf.int64),
        dense('steps', dtype=tf.int64),
        dense('data', dtype=tf.float64, mod_paddings=[mod_padding, 1]),
        dense('lengths', dtype=tf.int64, mod_paddings=[mod_padding]),
        dense('linenos', dtype=tf.int64, mod_paddings=[mod_padding]),
        shepherds_lib.SparseTensorShepherd('adjacency_list'),
    ]

    if control_flow_programs_version.at_least('0.0.44'):
        shepherds_info.append(dense('cyclomatic_complexity', dtype=tf.int64))
        shepherds_info.append(dense('max_indent', dtype=tf.float32))
    if control_flow_programs_version.supports_edge_types():
        shepherds_info.append(
            dense('edge_types', dtype=tf.int64, mod_paddings=[1]))
    return shepherds_info
def fill(self, hole, rng):
    """Fills `hole` with an `if` statement whose body is a new block hole.

    The condition compares `v0` (or `v0 % 10` from version 0.0.42 on) against a
    random operand in [0, max_value] using a random comparison operator.

    All randomness is drawn from `rng` so that a seeded generator yields
    reproducible programs; previously the global numpy random state was used
    and the `rng` argument was silently ignored.

    Args:
      hole: The hole being filled. Its metadata supplies the indentation of the
        generated `if` line and is copied (one indent level deeper) for the
        body hole.
      rng: A numpy RandomState (anything providing `randint` and `choice`). If
        None, numpy's global random state is used.

    Returns:
      A Program containing a single BLOCK hole for the body of the `if`.
    """
    if rng is None:
        # Keep working for callers that pass no generator.
        rng = np.random

    max_value = _max_value(self.config)
    # Draw the operand before the operator, matching the original order.
    operand = rng.randint(max_value + 1)
    cond_op = rng.choice(["<", "<=", ">", ">="])

    use_mod_conds = control_flow_programs_version.at_least("0.0.42")
    if use_mod_conds:
        test = f"v0 % 10 {cond_op} {operand}"
    else:
        test = f"v0 {cond_op} {operand}"

    def build(body):
        # Prepend the `if` header to the already-built body lines.
        return [f"{hole.metadata.indent_str}if {test}:"] + body

    block_metadata = dataclasses.replace(hole.metadata,
                                         indent=hole.metadata.indent + 1)
    block_hole = Hole(HoleType.BLOCK, block_metadata)
    return Program(1, [block_hole], build)
def _generate_statements(length, config, rng=None):
    """Generates the statements of a control flow program.

    How the program length is chosen:
      * If `config.exact_length` is set, `length` is used as given.
      * Otherwise, from version 0.0.44 on, the length is sampled from
        `config.length_distribution` (a mapping from length to probability).
      * Otherwise (older versions), with probability 0.10 the length becomes a
        candidate for shrinking; it is then decremented repeatedly, each time
        with probability 0.5, and never below 2.

    Args:
      length: The target program length.
      config: The dataset config.
      rng: (optional) A numpy RandomState. A fresh one is created when omitted.

    Returns:
      The list of statements in the generated program.
    """
    rng = np.random.RandomState() if rng is None else rng

    if config.exact_length:
        return _generate_statements_with_length(length, config, rng)

    if control_flow_programs_version.at_least("0.0.44"):
        assert config.length_distribution is not None
        lengths, probabilities = zip(*config.length_distribution.items())
        sampled_length = rng.choice(lengths, p=probabilities)
        return _generate_statements_with_length(sampled_length, config, rng)

    # Legacy behavior: occasionally generate a shorter program.
    if rng.random() < 0.10:
        # Each further decrement is half as likely as the previous one.
        while length > 2 and rng.random() > .5:
            length -= 1
    return _generate_statements_with_length(length, config, rng)
def test_version_at_least(self):
    """Checks that the current dataset version is at least 0.0.40."""
    is_at_least = control_flow_programs_version.at_least('0.0.40')
    self.assertTrue(is_at_least)
def get_tensor_info(self):
    """Gets the TensorInfos for the decoded Tensor features.

    Returns:
      A dict mapping feature name to a `tfds.features.TensorInfo`. Some
      entries are only present for dataset versions that define them.
    """
    make_info = tfds.features.TensorInfo
    tensor_info = {}

    # The index of the start node.
    tensor_info['start_index'] = make_info(shape=(), dtype=tf.int32)
    # The index of the exit node.
    tensor_info['exit_index'] = make_info(shape=(), dtype=tf.int32)
    # For each node, a sequence of integers representing the code at that node.
    tensor_info['data'] = make_info(shape=(None, None), dtype=tf.float64)
    # For each node, the number of elements in the data for that node.
    tensor_info['lengths'] = make_info(shape=(None, ), dtype=tf.int64)
    # For each node, the line number of the program corresponding to it.
    tensor_info['linenos'] = make_info(shape=(None, ), dtype=tf.int64)
    # The number of nodes in the graph.
    tensor_info['count'] = make_info(shape=(), dtype=tf.int32)
    # Identical to 'count', but represented with shape (1,) instead of ().
    tensor_info['num_nodes'] = make_info(shape=(1, ), dtype=tf.int64)
    # The number of steps needed to cover recursively every block twice.
    tensor_info['steps'] = make_info(shape=(1, ), dtype=tf.int64)
    tensor_info['shape'] = make_info(shape=(2, ), dtype=tf.int32)

    for name in ('true_branch_nodes', 'false_branch_nodes'):
        tensor_info[name] = make_info(shape=(None, ), dtype=tf.int64)

    # Each matrix is accompanied by a two-element "<name>_shape" feature.
    for name in ('adjacency_matrix', 'post_domination_matrix'):
        tensor_info[name] = make_info(shape=(None, None), dtype=tf.float64)
        tensor_info[name + '_shape'] = make_info(shape=(2, ), dtype=tf.int32)

    tensor_info['adjacency_list'] = make_info(shape=(None, 2), dtype=tf.int64)
    for suffix in ('source_indices', 'dest_indices'):
        tensor_info['adjacency_list/' + suffix] = make_info(shape=(None, ),
                                                            dtype=tf.int64)
    for suffix in ('dense_shape', 'shape'):
        tensor_info['adjacency_list/' + suffix] = make_info(shape=(2, ),
                                                            dtype=tf.int64)

    if control_flow_programs_version.at_least('0.0.44'):
        # The cyclomatic complexity of the program.
        tensor_info['cyclomatic_complexity'] = make_info(shape=(1, ),
                                                         dtype=tf.int64)
        # The maximum level of indentation in the program.
        tensor_info['max_indent'] = make_info(shape=(1, ), dtype=tf.float32)
    if control_flow_programs_version.supports_edge_types():
        tensor_info['edge_types'] = make_info(shape=(None, ), dtype=tf.int64)
    return tensor_info
def encode_example(self, cfg_and_python_source):
    """Encodes a control flow graph and its source as a dict of flat features.

    Args:
      cfg_and_python_source: A pair `(cfg, python_source)`. `cfg.nodes` supplies
        the graph nodes and `python_source` is the program text the graph was
        built from.

    Returns:
      A dict keyed by feature name. Matrix-like values are flattened to one
      dimension, with their original shapes stored under separate
      `*_shape` / `adjacency_list/*` keys. The `cyclomatic_complexity`,
      `max_indent` and `edge_types` entries are only included for dataset
      versions that support them.
    """
    cfg, python_source = cfg_and_python_source
    nodes = cfg.nodes
    lines = python_source.strip().split('\n')
    cyclomatic_complexity = cyclomatic_complexity_lib.cyclomatic_complexity2(
        cfg)

    # Earlier step-count formulas, kept for reference:
    # steps = 1  # Start with one step for reaching exit.
    # for line in lines:
    #   indent = (len(line) - len(line.lstrip())) / constants.INDENT_SPACES
    #   steps += 2 ** indent

    # if version < '0.0.38'
    # steps = 1  # Start with one step for reaching exit.
    # indents = []
    # for line in lines:
    #   indent = (len(line) - len(line.lstrip())) / constants.INDENT_SPACES
    #   while indents and indent <= indents[-1]:
    #     indents.pop()
    #   steps += 2 ** len(indents)
    #   if 'while' in line:
    #     indents.append(indent)

    max_indent = 0
    steps = 1  # Start with one step for reaching exit.
    # Stack of indent levels of the `while` lines that enclose the current line.
    indents = []
    for line in lines:
        # True division: `indent` is a float number of indentation levels.
        indent = (len(line) - len(line.lstrip())) / constants.INDENT_SPACES
        max_indent = max(max_indent, indent)
        # Leave every loop whose header is at the same or a deeper indent.
        while indents and indent <= indents[-1]:
            indents.pop()
        steps += 2**len(indents)
        # Substring test: any line containing the text 'while' counts.
        if 'while' in line:
            indents.append(indent)
            # We add steps at both levels of indentation for whiles.
            # Before for the initial condition check, after for subsequent condition
            # checks.
            steps += 2**len(indents)

    # cfg.nodes does not include an exit node, so we add 1.
    num_nodes = len(nodes) + 1
    exit_index = len(nodes)

    # Note that some of the nodes may have empty source.
    node_sources = [as_source(node, lines) for node in nodes]
    linenos = [node.instruction.node.lineno for node in nodes]
    # line_sources = python_source.strip().split('\n')

    if self.encoder:
        node_encodings = [
            self.encoder.encode(source) for source in node_sources
        ]
        # line_encodings = [
        #     self.encoder.encode(source)
        #     for source in line_sources
        # ]
    else:
        # Without an encoder the raw node sources are used as the encodings.
        node_encodings = node_sources
        # line_encodings = line_sources
    node_encodings.append(
        [])  # Finally add a blank line for the exit node.
    # line_encodings.append([])  # Finally add a blank line for the exit.
    linenos.append(len(lines) + 1)  # Add a lineno for the exit.

    # Pad encodings to all be the same length.
    padded_encodings = []
    encoding_lengths = []
    for encoding in node_encodings:
        encoding_lengths.append(len(encoding))
    max_len = max(encoding_lengths)
    for encoding in node_encodings:
        padded_encodings.append(
            np.pad(encoding, (0, max_len - len(encoding)), mode='constant'))
    # Flatten into one array; the (num_nodes, max_len) layout is recorded
    # separately under the 'shape' key below.
    padded_encodings = np.concatenate(padded_encodings)

    adjacency_matrix = get_adjacency_matrix(nodes, exit_index,
                                            self.include_back_edges)
    post_domination_matrix = get_post_domination_matrix(cfg)
    adjacency_list = get_adjacency_list(nodes, exit_index,
                                        self.include_back_edges)
    # Force a two-column 2-D array, even when the list is empty.
    adjacency_list = np.array(adjacency_list, ndmin=2)
    adjacency_list.shape = (-1, 2)

    branch_list = np.array(get_branch_list(nodes, exit_index))
    true_branch_nodes = branch_list[:, 0]
    false_branch_nodes = branch_list[:, 1]

    # NOTE(review): below, source indices are read from column 1 of the
    # adjacency list and dest indices from column 0 -- confirm this matches the
    # pair ordering produced by get_adjacency_list.
    encoded_example = {
        'start_index': [0],
        'exit_index': [exit_index],
        'data': padded_encodings,
        # 'strings': node_sources,
        'lengths': encoding_lengths,
        'linenos': linenos,
        'steps': [steps],
        'count': [num_nodes],
        'num_nodes': [num_nodes],
        'shape': [num_nodes, max_len],
        'true_branch_nodes': true_branch_nodes,
        'false_branch_nodes': false_branch_nodes,
        'adjacency_matrix': np.reshape(adjacency_matrix, (-1, )),
        'adjacency_matrix_shape': adjacency_matrix.shape,
        'post_domination_matrix': np.reshape(post_domination_matrix, (-1, )),
        'post_domination_matrix_shape': post_domination_matrix.shape,
        'adjacency_list': np.reshape(adjacency_list, (-1, )),
        'adjacency_list/source_indices': np.array(adjacency_list)[:, 1],
        'adjacency_list/dest_indices': np.array(adjacency_list)[:, 0],
        'adjacency_list/dense_shape': adjacency_matrix.shape,
        'adjacency_list/shape': [len(adjacency_list), 2],
    }
    if control_flow_programs_version.at_least('0.0.44'):
        encoded_example.update({
            'cyclomatic_complexity': [cyclomatic_complexity],
            'max_indent': [max_indent],
        })
    if control_flow_programs_version.supports_edge_types():
        encoded_example.update({
            'edge_types':
            get_edge_types(nodes, exit_index, self.include_back_edges)
        })
    return encoded_example
def get_features_dict(feature_set_names, program_encoder, state_encoder,
                      branch_encoder, text_encoder):
    """Returns the features dict for the requested feature sets.

    Args:
      feature_set_names: Names of the feature sets to include. Each must be one
        of "human_readable", "code", "trace", "output" or "cfg".
      program_encoder: Encoder for program statements and for the cfg features.
      state_encoder: Encoder for intermediate outputs and the target output.
      branch_encoder: Encoder for branch decisions.
      text_encoder: Encoder for the "lm_text" feature.

    Returns:
      A `tfds.features.FeaturesDict` combining the requested feature sets.
    """

    def int32_scalar():
        return tfds.features.Tensor(shape=tuple(), dtype=tf.int32)

    def scalar_sequence(dtype):
        return tfds.features.Sequence(
            tfds.features.Tensor(shape=tuple(), dtype=dtype), )

    statement_features = {
        "statements": tfds.features.Text(encoder=program_encoder),
        "length": int32_scalar(),
        "num_statements": int32_scalar(),
        "intermediate_outputs": tfds.features.Text(encoder=state_encoder),
        "intermediate_outputs_mask": scalar_sequence(tf.bool),
        "intermediate_output_lengths": scalar_sequence(tf.int32),
        "intermediate_outputs_count": int32_scalar(),
        "branch_decisions": tfds.features.Text(encoder=branch_encoder),
        "branch_decisions_mask": scalar_sequence(tf.bool),
        "branch_decisions_count": int32_scalar(),
    }

    output_features = {
        "target_output": tfds.features.Text(encoder=state_encoder),
        "target_output_length": int32_scalar(),
        "lm_text": tfds.features.Text(encoder=text_encoder),
    }

    feature_sets = {
        "human_readable": {
            "human_readable_code": tfds.features.Text(),
            "human_readable_target_output": tfds.features.Text(),
            # TODO(dbieber): Enable for the next version of the dataset.
            # # Used for partial programs.
            # "original_human_readable_code": tfds.features.Text(),
        },
        # "code" and "trace" reuse the same statement feature objects under
        # different key prefixes.
        "code": {
            "code_" + key: value
            for key, value in statement_features.items()
        },
        "trace": {
            "trace_" + key: value
            for key, value in statement_features.items()
        },
        "output": output_features,
        "cfg": {
            "cfg":
            control_flow_graph_feature.ControlFlowGraphFeature(
                include_back_edges=True, encoder=program_encoder),
            "cfg_forward":
            control_flow_graph_feature.ControlFlowGraphFeature(
                include_back_edges=False, encoder=program_encoder),
        },
    }

    if control_flow_programs_version.at_least("0.0.52"):
        error_names = [
            "NoError",
            "RuntimeError",  # 1 second timeout
            "ZeroDivisionError",
            "AssertionError",
            "ValueError",
            "TypeError",
            "IndexError",
            "NameError",
        ]
        if control_flow_programs_version.at_least("0.0.57"):
            error_names.append("AttributeError")
        error_names.extend(["RecursionError", "MemoryError"])
        output_features["error_type"] = tfds.features.ClassLabel(
            names=error_names)

    selected = {}
    for set_name in feature_set_names:
        selected.update(feature_sets[set_name])
    return tfds.features.FeaturesDict(selected)