Ejemplo n.º 1
0
    def EncodeNodes(self, g: nx.DiGraph, ir: Optional[str] = None) -> None:
        """Attach preprocessed text and embedding indices to the graph nodes.

        Statement nodes have their 'text' run through the inst2vec
        preprocessing; the result is stored as 'preprocessed_text' and 'x'
        holds the matching dictionary entry (the '!UNK' entry when the text is
        not in the dictionary). Identifier and immediate nodes receive the
        fixed '!IDENTIFIER' / '!IMMEDIATE' placeholders. Every node gets an
        empty 'y' list.

        Args:
          g: The graph whose nodes are annotated in place.
          ir: The LLVM IR that the graph was built from. Only needed for
            struct inlining; leave unset to skip that step.
        """
        # Gather the attribute dicts of all statement nodes, in graph order.
        statement_attrs = [
            attrs for _, attrs in g.nodes(data=True)
            if attrs["type"] == programl_pb2.Node.STATEMENT
        ]
        # The preprocessor takes a list of "files", each a list of lines; every
        # statement is wrapped in its own single-line file.
        lines = [[attrs["text"]] for attrs in statement_attrs]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Struct names
            # are substituted by their definitions extracted from the IR. This
            # is brittle textual replacement; ideally the inlining would happen
            # in llvm2graph, where a parsed llvm::Module is available.
            struct_definitions = inst2vec_preprocess.GetStructTypes(ir)
            for struct_name, struct_body in struct_definitions.items():
                for line in lines:
                    line[0] = line[0].replace(struct_name, struct_body)

        cleaned_lines, _ = inst2vec_preprocess.preprocess(lines)
        # A statement may be removed entirely by the preprocessor, in which
        # case its entry is empty and "" is canonicalized instead.
        canonical_texts = [
            inst2vec_preprocess.PreprocessStatement(
                entry[0] if len(entry) else "")
            for entry in cleaned_lines
        ]
        for attrs, text in zip(statement_attrs, canonical_texts):
            attrs["preprocessed_text"] = text
            attrs["x"] = [self.dictionary.get(text, self.dictionary["!UNK"])]

        # Non-statement nodes get a fixed placeholder token per node type.
        placeholders = (
            (programl_pb2.Node.IDENTIFIER, "!IDENTIFIER"),
            (programl_pb2.Node.IMMEDIATE, "!IMMEDIATE"),
        )
        for _, attrs in g.nodes(data=True):
            for node_type, token in placeholders:
                if attrs["type"] == node_type:
                    attrs["preprocessed_text"] = token
                    attrs["x"] = [self.dictionary[token]]
                    break

            # Every node, whatever its type, starts with an empty label list.
            attrs["y"] = []
Ejemplo n.º 2
0
    def EncodeNodes(self, g: nx.DiGraph) -> None:
        """Pre-process the statement text and set the text embedding index.

        For each statement node, this sets the 'preprocessed_text', 'x', and
        'y' attributes. Nodes of any other type are left untouched.

        Args:
          g: The graph to encode the nodes of. Statement nodes are expected to
            carry a 'text' attribute, i.e. a line of LLVM code.
        """
        # Collect the attribute dicts of the statement nodes once. The texts
        # handed to the preprocessor and the nodes receiving the results then
        # line up one-to-one, even if the graph holds other node types.
        statements = [
            data for _, data in g.nodes(data=True)
            if data["type"] == programl_pb2.Node.STATEMENT
        ]
        # The preprocessor takes a list of "files", each a list of lines; every
        # statement is wrapped in its own single-line file.
        lines = [[data["text"]] for data in statements]
        # Clean the lines.
        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        # Canonicalize the lines by removing identifier names etc.
        preprocessed_texts = [
            # Lines may have been cleaned away completely, so fall back to "".
            inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
            for x in preprocessed_lines
        ]

        unknown_index = self.dictionary["!UNK"]
        # Write the canonicalized texts back to the statement nodes.
        for data, text in zip(statements, preprocessed_texts):
            if text:
                data["preprocessed_text"] = text
                # Look up the canonicalized statement, not the raw text; the
                # dictionary is queried with the preprocessed form.
                data["x"] = [self.dictionary.get(text, unknown_index)]
            else:
                # Statements that were cleaned away entirely are unknown.
                data["preprocessed_text"] = "!UNK"
                data["x"] = [unknown_index]
            data["y"] = []
Ejemplo n.º 3
0
def EncodeLlvmBytecode(
        bytecode: str,
        vocab: vocabulary.VocabularyZipFile) -> typing.List[int]:
    """Translate LLVM bytecode text into a list of vocabulary indices.

    The bytecode is split into lines, preprocessed as a single file, and each
    resulting statement is canonicalized. Statements matching the label
    pattern are dropped; every other statement is mapped to its dictionary
    entry, or to the entry of the unknown token if it is not in the
    dictionary.

    Args:
      bytecode: The bytecode text, with statements separated by newlines.
      vocab: The vocabulary whose `dictionary` is used for the lookup.

    Returns:
      One vocabulary index per retained statement, in order.
    """
    # The preprocessor works on a list of files; pass the bytecode as the only
    # file and take the only result back out.
    per_file_lines, _ = preprocess.preprocess([bytecode.split("\n")])

    # TODO(cec): inline_struct_types_txt

    # Abstract identifiers from statements.
    statements = [
        preprocess.PreprocessStatement(line) for line in per_file_lines[0]
    ]

    label_pattern = r"((?:<label>:)?(<LABEL>):|:; <label>:<LABEL>)"

    # Translate from statement to encoded token, skipping label statements.
    encoded = []
    for statement in statements:
        if re.match(label_pattern, statement):
            continue
        encoded.append(
            vocab.dictionary.get(statement,
                                 vocab.dictionary[rgx.unknown_token]))
    return encoded
Ejemplo n.º 4
0
    def EncodeNodes(self, g: nx.DiGraph) -> None:
        """Pre-process the statement text and set the text embedding index.

        For each statement node, this sets the 'preprocessed_text' and 'x'
        attributes. Nodes of any other type are left untouched.

        Args:
          g: The graph to encode the nodes of.
        """
        # Collect the attribute dicts of the statement nodes once. The texts
        # handed to the preprocessor and the nodes receiving the results then
        # line up one-to-one, even if the graph holds other node types.
        statements = [
            data for _, data in g.nodes(data=True)
            if data["type"] == programl_pb2.Node.STATEMENT
        ]
        # The preprocessor takes a list of "files", each a list of lines; every
        # statement is wrapped in its own single-line file.
        lines = [[data["text"]] for data in statements]
        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        preprocessed_texts = [
            # Lines may have been cleaned away completely, so fall back to "".
            inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
            for x in preprocessed_lines
        ]

        # 'x' must hold a dictionary index, so the fallback is the index of
        # the "!UNK" entry rather than the literal string "!UNK".
        unknown_index = self.dictionary["!UNK"]
        for data, text in zip(statements, preprocessed_texts):
            if text:
                data["preprocessed_text"] = text
                # Look up the canonicalized statement, not the raw text.
                data["x"] = [self.dictionary.get(text, unknown_index)]
            else:
                # Statements that were cleaned away entirely are unknown.
                data["preprocessed_text"] = "!UNK"
                data["x"] = [unknown_index]
Ejemplo n.º 5
0
    def Encode(self,
               proto: program_graph_pb2.ProgramGraph,
               ir: Optional[str] = None) -> program_graph_pb2.ProgramGraph:
        """Pre-process the node text and set the text embedding index.

        For each instruction node, this appends the preprocessed text to the
        'inst2vec_preprocessed' feature and its dictionary entry to the
        'inst2vec_embedding' feature. Variable and constant nodes only get an
        'inst2vec_embedding' value, taken from the '!IDENTIFIER' and
        '!IMMEDIATE' dictionary entries respectively. The graph itself is
        marked with an 'inst2vec_annotated' feature.

        Args:
          proto: The ProgramGraph to encode. It is modified in place.
          ir: The LLVM IR that was used to construct the graph. This is
            required for struct inlining. If struct inlining is not required,
            this may be omitted.

        Returns:
          The input proto.
        """
        # Gather the instruction texts to pre-process. The preprocessor takes
        # a list of "files", each a list of lines; every instruction is
        # wrapped in its own single-line file.
        lines = [[NodeFullText(node)] for node in proto.node
                 if node.type == node_pb2.Node.INSTRUCTION]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the
            # struct definitions from the IR and inline their definitions in
            # place of the struct names. This is brittle string substitution;
            # in the future we should do this inlining in llvm2graph where we
            # have a parsed llvm::Module.
            try:
                structs = inst2vec_preprocess.GetStructTypes(ir)
                for line in lines:
                    for struct, definition in structs.items():
                        line[0] = line[0].replace(struct, definition)
            except ValueError:
                # Struct inlining is best-effort: a ValueError here is
                # deliberately ignored and encoding continues with whatever
                # text the lines hold at this point.
                pass

        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        preprocessed_texts = [
            # Lines may have been cleaned away completely, so fall back to "".
            inst2vec_preprocess.PreprocessStatement(x[0] if len(x) else "")
            for x in preprocessed_lines
        ]

        # Add the node features.
        var_embedding = self.dictionary["!IDENTIFIER"]
        const_embedding = self.dictionary["!IMMEDIATE"]
        unknown_embedding = self.dictionary["!UNK"]

        # preprocessed_texts holds one entry per instruction node, in the
        # order the instruction nodes appear in proto.node.
        text_index = 0
        for node in proto.node:
            if node.type == node_pb2.Node.INSTRUCTION:
                text = preprocessed_texts[text_index]
                text_index += 1
                # Look up the str form: the dictionary is indexed with str
                # keys, and bytes never compare equal to str, so an encoded
                # key would always fall through to the unknown entry.
                embedding = self.dictionary.get(text, unknown_embedding)
                # The bytes_list feature stores the UTF-8 encoded text.
                node.features.feature[
                    "inst2vec_preprocessed"].bytes_list.value.append(
                        text.encode("utf-8"))
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(embedding)
            elif node.type == node_pb2.Node.VARIABLE:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        var_embedding)
            elif node.type == node_pb2.Node.CONSTANT:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        const_embedding)

        proto.features.feature["inst2vec_annotated"].int64_list.value.append(1)
        return proto