Exemple #1
0
def FromBytes(
    data: bytes,
    fmt: StdinGraphFormat,
    proto: Optional[programl_pb2.ProgramGraph] = None,
    empty_okay: bool = False,
) -> programl_pb2.ProgramGraph:
    """Deserialize a program graph from bytes in one of the stdin formats.

    Args:
      data: The encoded graph.
      fmt: How `data` is encoded: a binary proto (PB), a text-format proto
        (PBTXT, decoded as UTF-8), or a pickled networkx graph (NX).
      proto: An existing ProgramGraph to populate. A new message is created
        when this is omitted.
      empty_okay: When False, the decoded message must be initialized and
        must contain at least one node.

    Returns:
      The populated program graph message.

    Raises:
      ValueError: If `fmt` is not one of the handled formats, or if
        `empty_okay` is False and the decoded graph is uninitialized or has
        no nodes.
    """
    graph = proto if proto else programl_pb2.ProgramGraph()

    if fmt == StdinGraphFormat.PB:
        graph.ParseFromString(data)
    elif fmt == StdinGraphFormat.PBTXT:
        text = data.decode("utf-8")
        pbutil.FromString(text, graph)
    elif fmt == StdinGraphFormat.NX:
        # NOTE(review): unpickling executes arbitrary code embedded in the
        # payload. This branch is only safe if `data` comes from a trusted
        # producer -- confirm against callers.
        nx_graph = pickle.loads(data)
        NetworkXToProgramGraph(nx_graph, proto=graph)
    else:
        raise ValueError(f"Unknown program graph format: {fmt}")

    # Validation is skipped entirely when empty graphs are acceptable.
    if empty_okay:
        return graph

    if not graph.IsInitialized():
        raise ValueError("Program graph is uninitialized")
    if not graph.node:
        raise ValueError("Program graph contains no nodes")
    return graph
Exemple #2
0
 def proto(
   self, proto: programl_pb2.ProgramGraph = None
 ) -> programl_pb2.ProgramGraph:
   """Parse this object's serialized proto into a ProgramGraph message.

   Args:
     proto: An existing ProgramGraph to parse into. A new message is created
       when this is omitted.

   Returns:
     The message that `self.data.serialized_proto` was parsed into.
   """
   message = proto if proto else programl_pb2.ProgramGraph()
   message.ParseFromString(self.data.serialized_proto)
   return message
Exemple #3
0
def SerializedProgramGraphToBytes(serialized_proto: bytes,
                                  fmt: StdoutGraphFormat) -> bytes:
    """Re-encode an already-serialized program graph in an output format.

    Args:
      serialized_proto: A ProgramGraph message in binary wire format.
      fmt: The requested output encoding.

    Returns:
      `serialized_proto` itself when `fmt` is PB; otherwise the result of
      parsing the message and passing it to ToBytes() with `fmt`.
    """
    if fmt != StdoutGraphFormat.PB:
        # Any other format needs the parsed message to re-encode from.
        graph = programl_pb2.ProgramGraph()
        graph.ParseFromString(serialized_proto)
        return ToBytes(graph, fmt)

    # The input is already in binary proto form, so no round-trip is needed.
    return serialized_proto
Exemple #4
0
def test_proto_networkx_equivalence_with_preallocated_proto(
    random_100_proto: programl_pb2.ProgramGraph, ):
    """Round-trip a proto through networkx into a caller-allocated proto.

    The fixture proto is converted to a networkx graph, and that graph is
    converted back into a ProgramGraph instance created by the test. The
    function, node and edge lists of the result must equal the input's.
    """
    source = random_100_proto

    # Forward conversion: the networkx graph must have one node / edge per
    # proto node / edge.
    nx_graph = programl.ProgramGraphToNetworkX(source)
    assert nx_graph.number_of_nodes() == len(source.node)
    assert nx_graph.number_of_edges() == len(source.edge)

    # Reverse conversion, writing into a message allocated up front rather
    # than letting the converter create one.
    round_tripped = programl_pb2.ProgramGraph()
    programl.NetworkXToProgramGraph(nx_graph, proto=round_tripped)
    assert round_tripped.function == source.function
    assert round_tripped.node == source.node
    assert round_tripped.edge == source.edge
Exemple #5
0
def test_proto_networkx_equivalence_with_preallocated_proto(
  llvm_program_graph: programl_pb2.ProgramGraph,
):
  """Round-trip an LLVM program graph through networkx into a given proto.

  Only the set of function names and the node / edge counts are compared
  between the input and the round-tripped proto.
  """
  source = llvm_program_graph

  # Forward conversion: proto -> networkx.
  nx_graph = programl.ProgramGraphToNetworkX(source)
  assert nx_graph.number_of_nodes() == len(source.node)
  assert nx_graph.number_of_edges() == len(source.edge)

  # Reverse conversion into a message allocated by the test.
  round_tripped = programl_pb2.ProgramGraph()
  programl.NetworkXToProgramGraph(nx_graph, proto=round_tripped)

  round_tripped_names = {fn.name for fn in round_tripped.function}
  source_names = {fn.name for fn in source.function}
  assert round_tripped_names == source_names
  assert len(source.node) == len(round_tripped.node)
  assert len(source.edge) == len(round_tripped.edge)
Exemple #6
0
def NetworkXToProgramGraph(
    g: nx.MultiDiGraph,
    proto: Optional[programl_pb2.ProgramGraph] = None,
    **proto_fields,
) -> programl_pb2.ProgramGraph:
    """Convert a networkx graph back into a ProgramGraph protocol buffer.

    This is the inverse of ProgramGraphToNetworkX(); see that function for
    the layout of the graph attributes.

    Args:
      g: The networkx graph to convert. Its graph-level attributes must
        include "x" and "y"; every node must carry "type", "text",
        "preprocessed_text", "function", "x" and "y"; every edge must carry
        "flow" and "position".
      proto: An optional message to append to. It is not cleared here, so the
        caller is responsible for clearing it beforehand. A new message is
        created when this is omitted.
      **proto_fields: Keyword arguments forwarded to the ProgramGraph
        constructor. Ignored when `proto` is supplied.

    Returns:
      The populated ProgramGraph message.
    """
    graph_proto = proto if proto else programl_pb2.ProgramGraph(**proto_fields)
    graph_attrs = g.graph

    # Function IDs are positions in the sorted list of distinct function
    # names found on the nodes.
    function_names = sorted(
        {name for _, name in g.nodes(data="function") if name})
    function_index = {name: idx for idx, name in enumerate(function_names)}

    # Emit one function message per name, with its LLVM entry count when the
    # graph carries one for that function.
    has_entry_counts = "llvm_function_entry_count" in graph_attrs
    for name in function_names:
        function_proto = graph_proto.function.add()
        function_proto.name = name
        if (has_entry_counts
                and name in graph_attrs["llvm_function_entry_count"]):
            function_proto.llvm_entry_count = graph_attrs[
                "llvm_function_entry_count"][name]

    # Graph-level feature and label vectors.
    graph_proto.x[:] = np.array(graph_attrs["x"], dtype=np.int64).tolist()
    graph_proto.y[:] = np.array(graph_attrs["y"], dtype=np.int64).tolist()

    # Optional graph-level scalars whose attribute name matches the proto
    # field name.
    for field in (
            "data_flow_root_node",
            "data_flow_steps",
            "data_flow_positive_node_count",
    ):
        if field in graph_attrs:
            setattr(graph_proto, field, graph_attrs[field])

    # Optional graph-level LLVM profiling info: (graph attribute, field of
    # the llvm_profile sub-message).
    for attr_key, profile_field in (
        ("llvm_profile_num_functions", "num_functions"),
        ("llvm_profile_max_function_count", "max_function_count"),
        ("llvm_profile_num_counts", "num_counts"),
        ("llvm_profile_total_count", "total_count"),
        ("llvm_profile_max_count", "max_count"),
        ("llvm_profile_max_internal_count", "max_internal_count"),
    ):
        if attr_key in graph_attrs:
            setattr(graph_proto.llvm_profile, profile_field,
                    graph_attrs[attr_key])

    # Nodes are appended in the graph's iteration order.
    for _, attrs in g.nodes(data=True):
        node_proto = graph_proto.node.add()
        node_proto.type = attrs["type"]
        node_proto.text = attrs["text"]
        node_proto.preprocessed_text = attrs["preprocessed_text"]
        owner = attrs["function"]
        if owner is not None:
            node_proto.function = function_index[owner]
        node_proto.x[:] = np.array(attrs["x"], dtype=np.int64).tolist()
        node_proto.y[:] = np.array(attrs["y"], dtype=np.int64).tolist()
        # Optional node-level LLVM profiling weights.
        for field in (
                "llvm_profile_true_weight",
                "llvm_profile_false_weight",
                "llvm_profile_total_weight",
        ):
            weight = attrs.get(field)
            if weight is not None:
                setattr(node_proto, field, weight)

    # Edges: the networkx endpoints are written out directly as node indices.
    for src, dst, attrs in g.edges(data=True):
        edge_proto = graph_proto.edge.add()
        edge_proto.source_node = src
        edge_proto.destination_node = dst
        edge_proto.flow = attrs["flow"]
        edge_proto.position = attrs["position"]

    return graph_proto
Exemple #7
0
def NetworkXGraphToProgramGraphProto(
    g: nx.MultiDiGraph, ) -> programl_pb2.ProgramGraph:
    """Translate a legacy control-and-data-flow networkx graph into a proto.

    The "root" node always becomes node 0; the remaining nodes follow in the
    graph's iteration order. "magic" nodes are emitted as plain statements.

    Args:
      g: A graph produced by the old control-and-data-flow graph builder.

    Returns:
      A new ProgramGraph message.

    Raises:
      ValueError: If the graph has no "root" node, a node has no type, or a
        node has none of the "original_text", "text" or "name" attributes.
      KeyError: If a node type or edge flow is not one of the names in the
        translation tables below.
    """
    graph_proto = programl_pb2.ProgramGraph()

    # Function IDs are positions in the sorted list of distinct names.
    function_names = sorted(
        {fn for _, fn in g.nodes(data="function") if fn})
    function_to_idx_map = {fn: idx for idx, fn in enumerate(function_names)}
    for name in function_names:
        graph_proto.function.add().name = name

    # Fix the output order of the nodes, root first, and remember each node's
    # position for translating edge endpoints later.
    if "root" not in g.nodes:
        raise ValueError(f"Graph has no root node: {g.nodes}")
    ordered_names = ["root"]
    ordered_names.extend(name for name in g.nodes if name != "root")
    node_to_idx_map = {name: idx for idx, name in enumerate(ordered_names)}

    node_type_by_name = {
        "statement": programl_pb2.Node.STATEMENT,
        "identifier": programl_pb2.Node.IDENTIFIER,
        "immediate": programl_pb2.Node.IMMEDIATE,
        # The "magic" node type is being removed; such nodes become regular
        # statements of unknown type.
        "magic": programl_pb2.Node.STATEMENT,
    }

    for name in ordered_names:
        node = g.nodes[name]
        node_proto = graph_proto.node.add()

        node_type = node.get("type")
        if not node_type:
            raise ValueError(f"Node has no type: {node_type}")
        node_proto.type = node_type_by_name[node_type]

        # Pick the text fields from the richest attribute available.
        if "original_text" in node:
            node_proto.text = node["original_text"]
            node_proto.preprocessed_text = node["text"]
        elif "text" in node:
            node_proto.text = node["text"]
            node_proto.preprocessed_text = node["text"]
        elif "name" in node:
            node_proto.text = node["name"]
            node_proto.preprocessed_text = node["name"]
        else:
            raise ValueError(f"Node has no original_text or name: {node}")

        # The encoded representation, when present, is a single value.
        encoded = node.get("x", None)
        if encoded is not None:
            node_proto.x.append(encoded)

        owner = node.get("function")
        if owner:
            node_proto.function = function_to_idx_map[owner]

    flow_by_name = {
        "call": programl_pb2.Edge.CALL,
        "control": programl_pb2.Edge.CONTROL,
        "data": programl_pb2.Edge.DATA,
    }

    # Edges reference nodes by their position in the node list.
    for src, dst, attrs in g.edges(data=True):
        edge_proto = graph_proto.edge.add()
        edge_proto.flow = flow_by_name[attrs["flow"]]
        edge_proto.source_node = node_to_idx_map[src]
        edge_proto.destination_node = node_to_idx_map[dst]
        edge_proto.position = attrs.get("position", 0)

    return graph_proto
Exemple #8
0
def CreateRandomProto(
  node_x_dimensionality: int = 1,
  node_y_dimensionality: int = 0,
  graph_x_dimensionality: int = 0,
  graph_y_dimensionality: int = 0,
  with_data_flow: bool = False,
  node_count: int = None,
) -> programl_pb2.ProgramGraph:
  """Build a program graph filled with random but well-formed values.

  The result has plausible field values but no meaningful semantics (e.g. a
  data edge may connect two identifiers). What the generator does provide:

    1. Node 0 is a statement with text "root"; all of its outgoing edges are
       call edges.
    2. Every other node is a statement, identifier or immediate, with text
       and preprocessed_text set.
    3. Every node has `node_x_dimensionality` x values drawn from {0, 1}.
    4. Each node attempts between one and three outgoing edges to other
       nodes; a repeated (source, destination, flow) triple is skipped.
    5. Statements emit control or call edges, other nodes emit data edges,
       and every edge has a position in [0, 4].

  Args:
    node_x_dimensionality: Number of x values per node.
    node_y_dimensionality: Number of y values per node, drawn from [0, 99].
    graph_x_dimensionality: Number of graph-level x values, from [0, 99].
    graph_y_dimensionality: Number of graph-level y values, from [0, 99].
    with_data_flow: Whether to fill in the data_flow_* fields.
    node_count: Number of nodes. When omitted (or zero) a count in [5, 50]
      is chosen at random.

  Returns:
    A new ProgramGraph message.

  Raises:
    ValueError: If node_count is less than two.
  """
  node_count = node_count or random.randint(5, 50)

  if node_count < 2:
    raise ValueError("node_count < 2")

  proto = programl_pb2.ProgramGraph()
  last_node = node_count - 1

  def _RandomDst(src: int) -> int:
    """Draw node indices until one differs from the source."""
    candidate = random.randint(0, last_node)
    while candidate == src:
      candidate = random.randint(0, last_node)
    return candidate

  function_count = 0

  # Populate the node list.
  for i in range(node_count):
    node = proto.node.add()
    if i == 0:
      # Node zero is always the root.
      node.type = programl_pb2.Node.STATEMENT
      node.text = "root"
      node.preprocessed_text = "!UNK"
    else:
      node.type = np.random.choice(
        [
          programl_pb2.Node.STATEMENT,
          programl_pb2.Node.IDENTIFIER,
          programl_pb2.Node.IMMEDIATE,
        ],
        p=[0.45, 0.3, 0.25],
      )
      if node.type == programl_pb2.Node.STATEMENT:
        node.text = "statement"
        node.preprocessed_text = "!UNK"
        # Statements usually join an existing function; the first one, and
        # roughly 15% of the rest, open a new function instead.
        if function_count and random.random() < 0.85:
          node.function = random.randint(0, function_count - 1)
        else:
          node.function = function_count
          function_count += 1
      elif node.type == programl_pb2.Node.IDENTIFIER:
        node.text = "%0"
        node.preprocessed_text = "!IDENTIFIER"
      else:
        node.text = "0"
        node.preprocessed_text = "!IDENTIFIER"

    # Node features are restricted to {0, 1} so that they work with models
    # that hardcode "binary selector" embeddings.
    node.x[:] = np.random.randint(low=0, high=2, size=node_x_dimensionality)
    if node_y_dimensionality:
      node.y[:] = np.random.randint(low=0, high=100, size=node_y_dimensionality)

  # Populate the function list. NetworkXToProgramGraph() orders functions
  # lexicographically by name, so names are zero-padded (function 10 ->
  # fn_000010) to keep proto <-> nx conversions equivalent. This breaks down
  # once more than 999999 functions are generated.
  for i in range(function_count):
    proto.function.add().name = f"fn_{i + 1:06d}"

  # Triples already emitted, used to avoid parallel edges of the same flow.
  seen_edges: Set[Tuple[int, int, programl_pb2.Edge.Flow]] = set()

  # Populate the edge list.
  for src, node in enumerate(proto.node):
    for _ in range(random.randint(1, 3)):
      dst = _RandomDst(src)

      # The flow depends on the kind of node the edge leaves from.
      if src == 0:
        flow = programl_pb2.Edge.CALL
      elif node.type == programl_pb2.Node.STATEMENT:
        flow = np.random.choice(
          [programl_pb2.Edge.CONTROL, programl_pb2.Edge.CALL], p=[0.9, 0.1]
        )
      else:
        flow = programl_pb2.Edge.DATA

      triple = (src, dst, flow)
      if triple in seen_edges:
        continue
      seen_edges.add(triple)

      edge = proto.edge.add()
      edge.flow = flow
      edge.source_node = src
      edge.destination_node = dst
      edge.position = random.randint(0, 4)

  if graph_x_dimensionality:
    proto.x[:] = np.random.randint(low=0, high=100, size=graph_x_dimensionality)

  if graph_y_dimensionality:
    proto.y[:] = np.random.randint(low=0, high=100, size=graph_y_dimensionality)

  if with_data_flow:
    proto.data_flow_steps = random.randint(1, 50)
    proto.data_flow_root_node = random.randint(0, last_node)
    proto.data_flow_positive_node_count = random.randint(1, last_node)

  return proto
Exemple #9
0
def BuildProgramGraphProto(
    module: str,
    timeout: int = 120,
    graph: Optional[programl_pb2.ProgramGraph] = None,
) -> programl_pb2.ProgramGraph:
    """Construct a program graph for the given LLVM IR.

    The IR is piped to the llvm2graph binary, which is run under the
    `timeout` command, and the binary proto written to its stdout is parsed.

    Args:
      module: The LLVM IR string for a module.
      timeout: The maximum number of seconds to allow graph construction to
        run for.
      graph: An existing graph message to write the result to. If not
        provided, a new graph message is constructed.

    Returns:
      A ProgramGraph message instance.

    Raises:
      TimeoutError: If graph construction fails to complete within timeout
        seconds.
      ValueError: If graph construction fails. The message is the stderr
        output of the command, or "unknown error" if there was none.
    """
    # NOTE: Ideally the ml4pl::BuildProto() C++ function would be wrapped with
    # pybind11 and called directly. Attempts to do so hit double free / heap
    # corruption errors when linking both pybind11 and LLVM libraries into a
    # single library, so the C++ binary is instead run as a subprocess and the
    # protocol buffer is deserialized from its stdout. This costs an extra
    # proto serialize and deserialize per call.

    graph = graph or programl_pb2.ProgramGraph()

    # Build and execute a llvm2graph command. "-" reads the module from stdin.
    cmd = [
        "timeout",
        "-s9",
        str(timeout),
        str(LLVM2GRAPH),
        "-",
        "--stdout_fmt",
        "pb",
    ]
    process = subprocess.Popen(cmd,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # Pass the module input to the binary and wait for it to finish.
    stdout, stderr = process.communicate(module.encode("utf-8"))

    # A return code of 9 / -9 is treated as the SIGKILL sent on timeout.
    if process.returncode in (9, -9):
        raise TimeoutError(f"llvm2graph took longer than {timeout} seconds on "
                           f"{humanize.BinaryPrefix(len(module), 'B')} input")
    if process.returncode:
        # Decode leniently so that diagnostics containing bytes that are not
        # valid UTF-8 are still reported, rather than being discarded. This
        # also avoids raising from a `finally` clause, which would replace
        # any in-flight exception (including KeyboardInterrupt).
        message = stderr.decode("utf-8", errors="replace") or "unknown error"
        raise ValueError(message)

    # Parse the binary graph.
    graph.ParseFromString(stdout)
    return graph