def get_serialized_info(self):
    """Return the tf-example features, as encoded for storage on disk.

    The result starts from `self.get_tensor_info()` and then replaces (or
    adds) the entries listed below: single-value fields are declared with
    shape (1,), and the remaining overridden fields are declared flat with
    shape (None,).

    Returns:
      The dict returned by `self.get_tensor_info()`, updated in place, mapping
      feature keys to `tfds.features.TensorInfo` objects.
    """
    serialized = self.get_tensor_info()
    # TODO(dbieber): See if you can switch to something general purpose like:
    # return {
    #     key: tfds.features.TensorInfo(shape=(None,), dtype=value.dtype)
    #     for key, value in info.items()
    # }

    # Fields stored as a single value, i.e. with shape (1,).
    single_value_fields = [
        ('start_index', tf.int32),
        ('exit_index', tf.int32),
        ('count', tf.int32),
        ('num_nodes', tf.int64),
        ('steps', tf.int64),
    ]
    for key, dtype in single_value_fields:
        serialized[key] = tfds.features.TensorInfo(shape=(1, ), dtype=dtype)

    # Program-statistics fields only exist from dataset version 0.0.44 on.
    if control_flow_programs_version.at_least('0.0.44'):
        for key, dtype in (('max_indent', tf.float32),
                           ('cyclomatic_complexity', tf.int64)):
            serialized[key] = tfds.features.TensorInfo(shape=(1, ),
                                                       dtype=dtype)

    # Fields stored as one flat, variable-length vector, i.e. shape (None,).
    flat_fields = []
    if control_flow_programs_version.supports_edge_types():
        flat_fields.append(('edge_types', tf.int64))
    flat_fields.extend([
        ('data', tf.float64),
        ('adjacency_matrix', tf.float64),
        ('post_domination_matrix', tf.float64),
        ('adjacency_list', tf.int64),
    ])
    for key, dtype in flat_fields:
        serialized[key] = tfds.features.TensorInfo(shape=(None, ),
                                                   dtype=dtype)
    return serialized
def get_program_encoder(program_generator_config):
  """Builds the TextEncoder selected by the generator config's encoder_name.

  Args:
    program_generator_config: The generator config. Its `encoder_name` is
      always read; `base`, `num_digits` and `ops` are read only for the
      "simple" encoder.

  Returns:
    A SimplePythonSourceEncoder when encoder_name is "simple", or a
    TextSourceEncoder when encoder_name is "text".

  Raises:
    ValueError: If encoder_name is neither "simple" nor "text".
  """
  encoder_name = program_generator_config.encoder_name
  if encoder_name == "text":
    return TextSourceEncoder()
  if encoder_name != "simple":
    raise ValueError("Unexpected encoder_name", encoder_name)

  # From version 0.0.42 on, every comparison token also gets a " %" variant,
  # placed directly after the group of plain tokens it is derived from.
  with_mod_variants = control_flow_programs_version.at_least("0.0.42")
  comparisons = ["=", ">", ">=", "<", "<="]
  conditionals = [
      "if >", "if <", "while >", "while <",
      "if >=", "if <=", "while >=", "while <=",
  ]

  vocabulary = list(program_generator_config.ops)
  vocabulary.extend(comparisons)
  if with_mod_variants:
    vocabulary.extend(token + " %" for token in comparisons)
  vocabulary.extend(["if", "else", "while"])
  vocabulary.extend(conditionals)
  if with_mod_variants:
    vocabulary.extend(token + " %" for token in conditionals)
  vocabulary.extend(["pass", "continue", "break"])

  return SimplePythonSourceEncoder(
      base=program_generator_config.base,
      num_digits=program_generator_config.num_digits,
      ops=vocabulary,
      num_variables=10,  # TODO(dbieber): num_variables is hardcoded.
  )
def generate_example_from_python_object(executor, base, python_object,
                                        tokens_per_statement,
                                        target_output_length, mod, output_mod):
    """Builds an example dict for the program described by `python_object`.

    When `python_object` is a tuple, examples are generated for both the full
    and the partial source; the partial example is returned with its output
    fields overwritten by those of the full example.

    Args:
      executor: A python_interpreter Executor object.
      base: The base in which numbers are represented.
      python_object: Either a string representing Python source, or a tuple
        of the form (python_source, partial_python_source) where
        partial_python_source has a single line replaced with a placeholder.
      tokens_per_statement: The number of tokens to use to represent a
        statement in the encoded program.
      target_output_length: The length of the program output, measured in
        tokens.
      mod: The value (if any) to mod the intermediate values of the program
        by after each step of execution.
      output_mod: The value (if any) to mod the final values of the program
        by.

    Returns:
      An example dictionary.
    """

    def build_example(source):
        # All arguments other than the source are shared between the calls.
        return _generate_example_from_python_source(
            executor, base, source, tokens_per_statement,
            target_output_length, mod, output_mod)

    if not isinstance(python_object, tuple):
        # A plain source: the full example is the result.
        return build_example(python_object)

    full_source, partial_source = python_object
    full_example = build_example(full_source)
    partial_example = build_example(partial_source)

    # TODO(dbieber): Use a more general method for listing output fields.
    # Note: "original_human_readable_code" is currently not populated.
    output_keys = []
    if control_flow_programs_version.at_least("0.0.52"):
        output_keys.append("error_type")
    output_keys.extend([
        "target_output",
        "target_output_length",
        "human_readable_target_output",
    ])
    # The partial program is labelled with the outputs of the full program.
    for key in output_keys:
        partial_example[key] = full_example[key]
    return partial_example
 def get_shepherd_info(self):
     """Builds the list of shepherds, one per handled field.

     Returns:
       A list of `shepherds_lib` shepherd objects. The shepherds for
       'cyclomatic_complexity' / 'max_indent' and for 'edge_types' are
       appended only when the dataset version supports those fields.
     """
     # NOTE(review): presumably the padded dimensions are rounded up to a
     # multiple of this value -- confirm against shepherds_lib.
     padding = 32

     def node_index(key):
         return shepherds_lib.NodeIndexShepherd(key,
                                                node_count_key='count',
                                                dtype=tf.int32)

     def node_indices(key):
         return shepherds_lib.NodeIndicesShepherd(key,
                                                  node_count_key='count',
                                                  dtype=tf.int64,
                                                  shape=[None],
                                                  mod_paddings=[padding])

     def dense(key, dtype, **extra):
         return shepherds_lib.DenseTensorShepherd(key, dtype=dtype, **extra)

     shepherds = [
         node_index('start_index'),
         node_index('exit_index'),
         node_indices('true_branch_nodes'),
         node_indices('false_branch_nodes'),
         # A shepherd for 'strings' (dtype=tf.string) is not enabled.
         dense('num_nodes', tf.int64),
         dense('steps', tf.int64),
         dense('data', tf.float64, mod_paddings=[padding, 1]),
         dense('lengths', tf.int64, mod_paddings=[padding]),
         dense('linenos', tf.int64, mod_paddings=[padding]),
         shepherds_lib.SparseTensorShepherd('adjacency_list'),
     ]
     if control_flow_programs_version.at_least('0.0.44'):
         shepherds.append(dense('cyclomatic_complexity', tf.int64))
         shepherds.append(dense('max_indent', tf.float32))
     if control_flow_programs_version.supports_edge_types():
         shepherds.append(dense('edge_types', tf.int64, mod_paddings=[1]))
     return shepherds
  def fill(self, hole, rng):
    """Fills `hole` with an `if` statement whose body is a new block hole.

    The condition compares `v0` (or `v0 % 10` from version 0.0.42 on) against
    a randomly drawn operand using a randomly drawn comparison operator.

    Args:
      hole: The hole being filled. Its metadata provides the indentation
        string of the `if` line and the indent level of the nested block.
      rng: A numpy RandomState used for sampling, so that generation is
        reproducible from the caller's seed. If None, numpy's global random
        state is used instead.

    Returns:
      A Program with a single BLOCK hole, one indent level deeper than
      `hole`, whose build function prepends the `if` line to the block body.
    """
    # Sample from the caller-provided generator rather than the global numpy
    # state; the global state is only a fallback when no generator is given.
    sampler = np.random if rng is None else rng

    max_value = _max_value(self.config)
    operand = sampler.randint(max_value + 1)
    cond_op = sampler.choice(["<", "<=", ">", ">="])
    use_mod_conds = control_flow_programs_version.at_least("0.0.42")
    if use_mod_conds:
      test = f"v0 % 10 {cond_op} {operand}"
    else:
      test = f"v0 {cond_op} {operand}"

    def build(body):
      # The `if` header line followed by the statements of the filled block.
      return [f"{hole.metadata.indent_str}if {test}:"] + body

    # The body of the `if` lives one indentation level below the hole.
    block_hole = Hole(HoleType.BLOCK,
                      dataclasses.replace(
                          hole.metadata, indent=hole.metadata.indent + 1))
    # NOTE(review): the leading 1 is presumably the number of statements this
    # template contributes -- confirm against Program's definition.
    return Program(1, [block_hole], build)
def _generate_statements(length, config, rng=None):
    """Generates the statements of a control flow program.

    How the program length is chosen:
      * If config.exact_length is set, `length` is used as given.
      * Otherwise, from version 0.0.44 on, the length is sampled from
        config.length_distribution (a mapping from length to probability).
      * Otherwise (before 0.0.44), `length` is used unchanged 90% of the
        time; in the remaining cases it is shortened one statement at a time,
        each further reduction happening with probability one half and never
        going below 2.

    Args:
      length: The target program length. If config.exact_length, this length
        will be used exactly. Otherwise, other lengths are permitted.
      config: The dataset config.
      rng: (optional) A numpy RandomState. A fresh one is created when None.

    Returns:
      The list of statements in the generated program.
    """
    rng = np.random.RandomState() if rng is None else rng

    if not config.exact_length:
        if control_flow_programs_version.at_least("0.0.44"):
            assert config.length_distribution is not None
            lengths, probabilities = zip(*config.length_distribution.items())
            length = rng.choice(lengths, p=probabilities)
        elif rng.random() < 0.10:
            # Shrinking path, taken with probability 0.10: keep removing one
            # statement while a coin flip succeeds and length stays above 2.
            while length > 2 and rng.random() > .5:
                length -= 1

    return _generate_statements_with_length(length, config, rng)
Exemple #7
0
 def test_version_at_least(self):
   """Checks that the current dataset version is at least 0.0.40."""
   meets_minimum = control_flow_programs_version.at_least('0.0.40')
   self.assertTrue(meets_minimum)
 def get_tensor_info(self):
     """Gets the TensorInfos for the decoded Tensor features.

     Returns:
       A dict mapping each feature key to a `tfds.features.TensorInfo` with
       that feature's shape and dtype. Version-dependent features are only
       present when the dataset version supports them.
     """
     scalar = ()
     single = (1, )
     pair = (2, )
     vector = (None, )
     matrix = (None, None)

     # (key, shape, dtype) for every feature, in output order.
     specs = [
         # The index of the start node.
         ('start_index', scalar, tf.int32),
         # The index of the exit node.
         ('exit_index', scalar, tf.int32),
         # For each node, a sequence of integers representing the code at
         # that node. (A per-node 'strings' feature is not enabled.)
         ('data', matrix, tf.float64),
         # For each node, the number of elements in the data for that node.
         ('lengths', vector, tf.int64),
         # For each node, the line number of the program corresponding to it.
         ('linenos', vector, tf.int64),
         # The number of nodes in the graph.
         ('count', scalar, tf.int32),
         # Identical to 'count', but with shape (1,) instead of ().
         ('num_nodes', single, tf.int64),
         # The number of steps needed to cover recursively every block twice.
         ('steps', single, tf.int64),
         ('shape', pair, tf.int32),
         ('true_branch_nodes', vector, tf.int64),
         ('false_branch_nodes', vector, tf.int64),
         ('adjacency_matrix', matrix, tf.float64),
         ('adjacency_matrix_shape', pair, tf.int32),
         ('post_domination_matrix', matrix, tf.float64),
         ('post_domination_matrix_shape', pair, tf.int32),
         ('adjacency_list', (None, 2), tf.int64),
         ('adjacency_list/source_indices', vector, tf.int64),
         ('adjacency_list/dest_indices', vector, tf.int64),
         ('adjacency_list/dense_shape', pair, tf.int64),
         ('adjacency_list/shape', pair, tf.int64),
     ]
     if control_flow_programs_version.at_least('0.0.44'):
         specs.extend([
             # The cyclomatic complexity of the program.
             ('cyclomatic_complexity', single, tf.int64),
             # The maximum level of indentation in the program.
             ('max_indent', single, tf.float32),
         ])
     if control_flow_programs_version.supports_edge_types():
         specs.append(('edge_types', vector, tf.int64))

     return {
         key: tfds.features.TensorInfo(shape=shape, dtype=dtype)
         for key, shape, dtype in specs
     }
    def encode_example(self, cfg_and_python_source):
        """Encodes a (control flow graph, source) pair as a flat feature dict.

        Args:
          cfg_and_python_source: A 2-tuple `(cfg, python_source)`. `cfg.nodes`
            holds the graph's nodes (without an exit node) and `python_source`
            is the text of the program.

        Returns:
          A dict of lists and numpy arrays keyed by feature name. Matrices
          are flattened to one dimension, with their original shape stored
          under the matching `*_shape` key. 'cyclomatic_complexity' /
          'max_indent' and 'edge_types' are only included when the dataset
          version supports them.
        """
        cfg, python_source = cfg_and_python_source
        graph_nodes = cfg.nodes
        source_lines = python_source.strip().split('\n')

        complexity = cyclomatic_complexity_lib.cyclomatic_complexity2(cfg)

        # Walk the source once to find the deepest indentation and the step
        # count. Each line costs 2**(number of enclosing `while`s); a `while`
        # line is charged again at the deeper level: once for the initial
        # condition check and once for the subsequent checks. (Versions
        # before 0.0.38 used the same walk without that second charge.)
        deepest_indent = 0
        step_count = 1  # Start with one step for reaching exit.
        open_while_indents = []
        for text in source_lines:
            level = (len(text) - len(text.lstrip())) / constants.INDENT_SPACES
            deepest_indent = max(deepest_indent, level)
            # Leave every `while` whose body this line is no longer part of.
            while open_while_indents and level <= open_while_indents[-1]:
                open_while_indents.pop()
            step_count += 2**len(open_while_indents)
            if 'while' in text:
                open_while_indents.append(level)
                step_count += 2**len(open_while_indents)

        # cfg.nodes does not include an exit node; it gets the next index.
        exit_index = len(graph_nodes)
        num_nodes = exit_index + 1

        # Note that some of the nodes may have empty source.
        node_sources = [as_source(node, source_lines) for node in graph_nodes]
        linenos = [node.instruction.node.lineno for node in graph_nodes]
        linenos.append(len(source_lines) + 1)  # Add a lineno for the exit.

        if self.encoder:
            encode = self.encoder.encode
            node_encodings = [encode(text) for text in node_sources]
        else:
            node_encodings = node_sources
        node_encodings.append([])  # A blank entry for the exit node.

        # Right-pad every encoding with zeros to a common length, then lay
        # them out back to back in one flat array.
        encoding_lengths = [len(encoding) for encoding in node_encodings]
        max_len = max(encoding_lengths)
        padded_encodings = np.concatenate([
            np.pad(encoding, (0, max_len - size), mode='constant')
            for encoding, size in zip(node_encodings, encoding_lengths)
        ])

        adjacency_matrix = get_adjacency_matrix(graph_nodes, exit_index,
                                                self.include_back_edges)
        post_domination_matrix = get_post_domination_matrix(cfg)
        adjacency_list = np.array(
            get_adjacency_list(graph_nodes, exit_index,
                               self.include_back_edges),
            ndmin=2)
        # Force an (edges, 2) layout, also when there are no edges at all.
        adjacency_list.shape = (-1, 2)

        branches = np.array(get_branch_list(graph_nodes, exit_index))

        encoded = {
            'start_index': [0],
            'exit_index': [exit_index],
            'data': padded_encodings,
            'lengths': encoding_lengths,
            'linenos': linenos,
            'steps': [step_count],
            'count': [num_nodes],
            'num_nodes': [num_nodes],
            'shape': [num_nodes, max_len],
            'true_branch_nodes': branches[:, 0],
            'false_branch_nodes': branches[:, 1],
            'adjacency_matrix': np.reshape(adjacency_matrix, (-1, )),
            'adjacency_matrix_shape': adjacency_matrix.shape,
            'post_domination_matrix': np.reshape(post_domination_matrix,
                                                 (-1, )),
            'post_domination_matrix_shape': post_domination_matrix.shape,
            'adjacency_list': np.reshape(adjacency_list, (-1, )),
            # Sources are read from column 1 and destinations from column 0.
            'adjacency_list/source_indices': np.array(adjacency_list)[:, 1],
            'adjacency_list/dest_indices': np.array(adjacency_list)[:, 0],
            'adjacency_list/dense_shape': adjacency_matrix.shape,
            'adjacency_list/shape': [len(adjacency_list), 2],
        }
        if control_flow_programs_version.at_least('0.0.44'):
            encoded['cyclomatic_complexity'] = [complexity]
            encoded['max_indent'] = [deepest_indent]
        if control_flow_programs_version.supports_edge_types():
            encoded['edge_types'] = get_edge_types(graph_nodes, exit_index,
                                                   self.include_back_edges)
        return encoded
def get_features_dict(feature_set_names, program_encoder, state_encoder,
                      branch_encoder, text_encoder):
    """Returns the features dict for the requested feature sets.

    Args:
      feature_set_names: An iterable of feature set names, each one of
        "human_readable", "code", "trace", "output" or "cfg".
      program_encoder: The encoder for statement text and for the control
        flow graph features.
      state_encoder: The encoder for intermediate outputs and target output.
      branch_encoder: The encoder for branch decisions.
      text_encoder: The encoder for the "lm_text" feature.

    Returns:
      A `tfds.features.FeaturesDict` holding the union of the requested
      feature sets.

    Raises:
      KeyError: If a requested feature set name is not known.
    """

    def int32_scalar():
        return tfds.features.Tensor(shape=tuple(), dtype=tf.int32)

    def bool_sequence():
        return tfds.features.Sequence(
            tfds.features.Tensor(shape=tuple(), dtype=tf.bool), )

    # Per-statement features; exposed under both a "code_" and a "trace_"
    # prefix. Both prefixed entries refer to the same feature objects.
    statement_features = {
        "statements": tfds.features.Text(encoder=program_encoder),
        "length": int32_scalar(),
        "num_statements": int32_scalar(),
        "intermediate_outputs": tfds.features.Text(encoder=state_encoder),
        "intermediate_outputs_mask": bool_sequence(),
        "intermediate_output_lengths": tfds.features.Sequence(
            int32_scalar(), ),
        "intermediate_outputs_count": int32_scalar(),
        "branch_decisions": tfds.features.Text(encoder=branch_encoder),
        "branch_decisions_mask": bool_sequence(),
        "branch_decisions_count": int32_scalar(),
    }

    def prefixed(prefix):
        return {
            prefix + name: feature
            for name, feature in statement_features.items()
        }

    def cfg_feature(include_back_edges):
        return control_flow_graph_feature.ControlFlowGraphFeature(
            include_back_edges=include_back_edges, encoder=program_encoder)

    feature_sets = {
        "human_readable": {
            "human_readable_code": tfds.features.Text(),
            "human_readable_target_output": tfds.features.Text(),
            # TODO(dbieber): Enable "original_human_readable_code" (used for
            # partial programs) for the next version of the dataset.
        },
        "code": prefixed("code_"),
        "trace": prefixed("trace_"),
        "output": {
            "target_output": tfds.features.Text(encoder=state_encoder),
            "target_output_length": int32_scalar(),
            "lm_text": tfds.features.Text(encoder=text_encoder),
        },
        "cfg": {
            "cfg": cfg_feature(True),
            "cfg_forward": cfg_feature(False),
        },
    }

    if control_flow_programs_version.at_least("0.0.52"):
        # The position of a name in this list is its class label, so the
        # order must stay stable.
        error_names = [
            "NoError",
            "RuntimeError",  # 1 second timeout
            "ZeroDivisionError",
            "AssertionError",
            "ValueError",
            "TypeError",
            "IndexError",
            "NameError",
        ]
        if control_flow_programs_version.at_least("0.0.57"):
            error_names.append("AttributeError")
        error_names.extend(["RecursionError", "MemoryError"])
        feature_sets["output"]["error_type"] = tfds.features.ClassLabel(
            names=error_names)

    selected = {
        key: feature
        for set_name in feature_set_names
        for key, feature in feature_sets[set_name].items()
    }
    return tfds.features.FeaturesDict(selected)