Ejemplo n.º 1
0
    def EncodeNodes(self, g: nx.DiGraph, ir: Optional[str] = None) -> None:
        """Pre-process the node text and set the text embedding index.

        The graph is modified in place:

        * STATEMENT nodes get 'preprocessed_text' (the pre-processed form of
          their 'text' attribute) and 'x' (a single-element list holding the
          dictionary index of that text, or the '!UNK' index when the text is
          not in the dictionary).
        * IDENTIFIER and IMMEDIATE nodes get the fixed '!IDENTIFIER' /
          '!IMMEDIATE' text and the matching dictionary index.
        * Every node, whatever its type, gets an empty 'y' list.

        Args:
          g: The graph to encode the nodes of.
          ir: The LLVM IR that was used to construct the graph. This is
            required for struct inlining. If struct inlining is not required,
            this may be omitted.
        """
        # Collect the attribute dicts of the statement nodes once, so that the
        # texts gathered here and the write-back below iterate the exact same
        # sequence of nodes.
        statements = [
            data for _, data in g.nodes(data=True)
            if data["type"] == programl_pb2.Node.STATEMENT
        ]
        # Each text is wrapped in its own single-element list: this is the
        # shape handed to inst2vec_preprocess.preprocess() below, and it lets
        # the struct inlining rewrite the text in place.
        lines = [[data["text"]] for data in statements]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the
            # struct definitions from the IR and inline their definitions in
            # place of the struct names. This is brittle string substitution;
            # in the future we should do this inlining in llvm2graph where we
            # have a parsed llvm::Module.
            structs = inst2vec_preprocess.GetStructTypes(ir)
            # Substitute the longest names first. Struct names may be prefixes
            # of one another (e.g. '%struct.foo' and '%struct.foo.0'), and
            # replacing the shorter name first would corrupt every occurrence
            # of the longer one.
            substitutions = sorted(
                structs.items(), key=lambda item: len(item[0]), reverse=True)
            for line in lines:
                for struct, definition in substitutions:
                    line[0] = line[0].replace(struct, definition)

        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        preprocessed_texts = [
            # A line may come back empty from preprocess(); treat it as "".
            inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
            for x in preprocessed_lines
        ]
        for data, text in zip(statements, preprocessed_texts):
            data["preprocessed_text"] = text
            data["x"] = [self.dictionary.get(text, self.dictionary["!UNK"])]

        # Re-write the remaining graph nodes.
        for _, data in g.nodes(data=True):
            if data["type"] == programl_pb2.Node.IDENTIFIER:
                data["preprocessed_text"] = "!IDENTIFIER"
                data["x"] = [self.dictionary["!IDENTIFIER"]]
            elif data["type"] == programl_pb2.Node.IMMEDIATE:
                data["preprocessed_text"] = "!IMMEDIATE"
                data["x"] = [self.dictionary["!IMMEDIATE"]]

            # All nodes, including statements, get an empty label list.
            data["y"] = []
Ejemplo n.º 2
0
def test_GetStructTypes_no_structs():
    """An IR without any struct definition yields an empty mapping."""
    llvm_ir = """
define i32 @main(i32, i8**) #0 {
  %3 = alloca i32, align 4
  %4 = alloca i32, align 4
  %5 = alloca i8**, align 8
  store i32 0, i32* %3, align 4
  store i32 %0, i32* %4, align 4
  store i8** %1, i8*** %5, align 8
  ret i32 0
}
"""
    # The module above only defines @main, so nothing should be extracted.
    assert inst2vec_preprocess.GetStructTypes(llvm_ir) == {}
Ejemplo n.º 3
0
def test_GetStructTypes_one_struct():
    """A lone struct definition is returned keyed by its name."""
    llvm_ir = """
%struct.foo = type { i32, i8* }

define i32 @A(%struct.foo*) #0 {
  %2 = alloca %struct.foo*, align 8
  store %struct.foo* %0, %struct.foo** %2, align 8
  %3 = load %struct.foo*, %struct.foo** %2, align 8
  %4 = getelementptr inbounds %struct.foo, %struct.foo* %3, i32 0, i32 0
  %5 = load i32, i32* %4, align 8
  ret i32 %5
}
"""
    # The value is the struct body that follows `type` in the definition.
    expected = {"%struct.foo": "{ i32, i8* }"}
    assert inst2vec_preprocess.GetStructTypes(llvm_ir) == expected
Ejemplo n.º 4
0
def test_GetStructTypes_nested_structs():
    """A struct that refers to another struct has that reference inlined."""
    llvm_ir = """
%struct.bar = type { %struct.foo* }
%struct.foo = type { i32 }

; Function Attrs: noinline nounwind optnone uwtable
define void @Foo(%struct.bar*) #0 {
  %2 = alloca %struct.bar*, align 8
  store %struct.bar* %0, %struct.bar** %2, align 8
  ret void
}
"""
    # %struct.bar is defined before %struct.foo, yet its body is still
    # expected to carry the expanded definition of %struct.foo.
    expected = {
        "%struct.foo": "{ i32 }",
        "%struct.bar": "{ { i32 }* }",
    }
    assert inst2vec_preprocess.GetStructTypes(llvm_ir) == expected
Ejemplo n.º 5
0
    def Encode(self,
               proto: program_graph_pb2.ProgramGraph,
               ir: Optional[str] = None) -> program_graph_pb2.ProgramGraph:
        """Pre-process the node text and set the text embedding index.

        The proto is modified in place:

        * INSTRUCTION nodes get an 'inst2vec_preprocessed' feature (the
          UTF-8 encoded pre-processed text) and an 'inst2vec_embedding'
          feature (the dictionary index of that text, or the '!UNK' index
          when the text is not in the dictionary).
        * VARIABLE and CONSTANT nodes get only 'inst2vec_embedding', set to
          the '!IDENTIFIER' / '!IMMEDIATE' dictionary index respectively.
        * The graph-level 'inst2vec_annotated' feature has 1 appended.

        Args:
          proto: The ProgramGraph to encode.
          ir: The LLVM IR that was used to construct the graph. This is
            required for struct inlining. If struct inlining is not required,
            this may be omitted.

        Returns:
          The input proto.
        """
        # Gather the instruction texts to pre-process. Each text is wrapped in
        # its own single-element list so that struct inlining can rewrite it
        # in place before it is handed to preprocess().
        lines = [[NodeFullText(node)] for node in proto.node
                 if node.type == node_pb2.Node.INSTRUCTION]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the
            # struct definitions from the IR and inline their definitions in
            # place of the struct names. This is brittle string substitution;
            # in the future we should do this inlining in llvm2graph where we
            # have a parsed llvm::Module.
            try:
                structs = inst2vec_preprocess.GetStructTypes(ir)
            except ValueError:
                # Struct inlining is best-effort: when the struct types cannot
                # be extracted, carry on with the un-inlined instruction text
                # rather than failing the whole encoding.
                structs = {}
            # Substitute the longest names first. Struct names may be prefixes
            # of one another (e.g. '%struct.foo' and '%struct.foo.0'), and
            # replacing the shorter name first would corrupt every occurrence
            # of the longer one.
            substitutions = sorted(
                structs.items(), key=lambda item: len(item[0]), reverse=True)
            for line in lines:
                for struct, definition in substitutions:
                    line[0] = line[0].replace(struct, definition)

        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        preprocessed_texts = [
            # A line may come back empty from preprocess(); treat it as "".
            inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
            for x in preprocessed_lines
        ]

        # Add the node features.
        var_embedding = self.dictionary["!IDENTIFIER"]
        const_embedding = self.dictionary["!IMMEDIATE"]

        # preprocessed_texts is ordered like the INSTRUCTION nodes of
        # proto.node, so walk it with a cursor that only advances on
        # instructions.
        text_index = 0
        for node in proto.node:
            if node.type == node_pb2.Node.INSTRUCTION:
                text = preprocessed_texts[text_index]
                text_index += 1
                # Look the text up as str: the dictionary is indexed with str
                # keys ("!UNK", "!IDENTIFIER", ...), so a bytes key would never
                # match and every instruction would be encoded as "!UNK".
                embedding = self.dictionary.get(text, self.dictionary["!UNK"])
                # Only the stored feature needs bytes (it is a bytes_list).
                node.features.feature[
                    "inst2vec_preprocessed"].bytes_list.value.append(
                        text.encode("utf-8"))
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(embedding)
            elif node.type == node_pb2.Node.VARIABLE:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        var_embedding)
            elif node.type == node_pb2.Node.CONSTANT:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        const_embedding)

        # Mark the graph as having been annotated by this encoder.
        proto.features.feature["inst2vec_annotated"].int64_list.value.append(1)
        return proto