Beispiel #1
0
def LlvmBytecodeToContextualFlowGraph(bytecode: str) -> nx.DiGraph:
    """Construct a contextual flow graph (XFG) from LLVM bytecode text.

    The bytecode is split into lines, run through preprocess.preprocess() as a
    one-file batch, and the pre-processed result is passed to
    preprocess.BuildContextualFlowGraph().

    Args:
      bytecode: The input bytecode, as a single newline-separated string.

    Returns:
      The graph returned by preprocess.BuildContextualFlowGraph(). The
      multi-edge list that function also returns is discarded.
    """
    # preprocess() operates on a batch of files, so wrap the single input in a
    # list. As a side effect it also yields the function declarations per file.
    batch = [bytecode.split("\n")]
    preprocessed_batch, declared_functions_batch = preprocess.preprocess(batch)
    preprocessed_lines = preprocessed_batch[0]
    declared_functions = declared_functions_batch[0]

    # BuildContextualFlowGraph() requires a file name, but it is used only to
    # produce descriptive error messages, so a placeholder is sufficient.
    graph, _ = preprocess.BuildContextualFlowGraph(
        preprocessed_lines, declared_functions, filename="[input]")
    return graph
Beispiel #2
0
def PreprocessLlvmBytecode(bytecode: str) -> str:
    """Pre-process an LLVM bytecode for encoding.

    Args:
      bytecode: The input bytecode, as a single newline-separated string.

    Returns:
      The pre-processed lines of the input, re-joined with newlines.
    """
    # preprocess() takes a batch of files; the function declarations it also
    # returns are not needed here.
    preprocessed_batch, _ = preprocess.preprocess([bytecode.split("\n")])
    return "\n".join(preprocessed_batch[0])
Beispiel #3
0
    def EncodeNodes(self, g: nx.DiGraph, ir: Optional[str] = None) -> None:
        """Pre-process the node text and set the text embedding index.

        Statement nodes receive 'preprocessed_text' and 'x' derived from their
        'text' attribute. Identifier and immediate nodes receive the fixed
        "!IDENTIFIER" / "!IMMEDIATE" placeholders. Every node has 'y' set to an
        empty list.

        Args:
          g: The graph to encode the nodes of. Modified in place.
          ir: The LLVM IR that was used to construct the graph. This is required
            for struct inlining. If struct inlining is not required, this may be
            omitted.
        """
        # Collect the attribute dicts of the statement nodes once, so the
        # pre-processed texts can be written back to exactly the same nodes.
        statement_attrs = [
            attrs for _, attrs in g.nodes(data=True)
            if attrs["type"] == programl_pb2.Node.STATEMENT
        ]
        # preprocess() expects a list of files, each a list of lines; every
        # statement is passed as its own single-line "file".
        lines = [[attrs["text"]] for attrs in statement_attrs]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the
            # struct definitions from the IR and inline their definitions in
            # place of the struct names. This is brittle string substitution;
            # in the future this inlining should happen in llvm2graph, where a
            # parsed llvm::Module is available.
            struct_types = inst2vec_preprocess.GetStructTypes(ir)
            for entry in lines:
                inlined = entry[0]
                for struct_name, definition in struct_types.items():
                    inlined = inlined.replace(struct_name, definition)
                entry[0] = inlined

        cleaned_lines, _ = inst2vec_preprocess.preprocess(lines)
        # A statement may have been removed entirely by preprocess(), leaving
        # an empty list; fall back to the empty string in that case.
        canonical_texts = [
            inst2vec_preprocess.PreprocessStatement(
                cleaned[0] if len(cleaned) else "")
            for cleaned in cleaned_lines
        ]

        for attrs, canonical in zip(statement_attrs, canonical_texts):
            attrs["preprocessed_text"] = canonical
            attrs["x"] = [
                self.dictionary.get(canonical, self.dictionary["!UNK"])
            ]

        # Re-write the remaining graph nodes.
        for _, attrs in g.nodes(data=True):
            node_type = attrs["type"]
            if node_type == programl_pb2.Node.IDENTIFIER:
                attrs["preprocessed_text"] = "!IDENTIFIER"
                attrs["x"] = [self.dictionary["!IDENTIFIER"]]
            elif node_type == programl_pb2.Node.IMMEDIATE:
                attrs["preprocessed_text"] = "!IMMEDIATE"
                attrs["x"] = [self.dictionary["!IMMEDIATE"]]

            attrs["y"] = []
Beispiel #4
0
    def EncodeNodes(self, g: nx.DiGraph) -> None:
        """Pre-process the node text and set the text embedding index.

        For each statement node, this sets the 'preprocessed_text', 'x' and 'y'
        attributes. Nodes of any other type are left untouched.

        Args:
          g: The graph to encode the nodes of. Statement nodes are assumed to
            carry a 'text' attribute holding a line of LLVM code. Modified in
            place.
        """
        # Collect the statement nodes once. The pre-processed texts are produced
        # only for these nodes, so they must be written back to exactly these
        # nodes; zipping against all nodes would attach texts to the wrong
        # nodes whenever the graph also contains non-statement nodes.
        statement_attrs = [
            data for _, data in g.nodes(data=True)
            if data["type"] == programl_pb2.Node.STATEMENT
        ]
        # preprocess() expects a list of files, each a list of lines; every
        # statement is passed as its own single-line "file".
        lines = [[data["text"]] for data in statement_attrs]

        # Clean the lines.
        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        # Canonicalize the lines by removing identifier names etc. A line may
        # have been cleaned away completely, so fall back to ''.
        preprocessed_texts = [
            inst2vec_preprocess.PreprocessStatement(x[0] if x else "")
            for x in preprocessed_lines
        ]

        unknown_index = self.dictionary["!UNK"]

        # Write the canonicalized texts back to the graph.
        for data, text in zip(statement_attrs, preprocessed_texts):
            if text:
                data["preprocessed_text"] = text
                # Look up the canonicalized statement in the dictionary,
                # otherwise fall back to !UNK.
                data["x"] = [self.dictionary.get(text, unknown_index)]
            else:
                # Statements that were cleaned away entirely are !UNK.
                data["preprocessed_text"] = "!UNK"
                data["x"] = [unknown_index]
            data["y"] = []
Beispiel #5
0
def EncodeLlvmBytecode(
        bytecode: str,
        vocab: vocabulary.VocabularyZipFile) -> typing.List[int]:
    """Encode an LLVM bytecode to an array of vocabulary indices.

    The bytecode is pre-processed, identifiers are abstracted from each
    statement, label lines are dropped, and every remaining statement is mapped
    to its index in vocab.dictionary. Statements that are not in the dictionary
    map to the index of rgx.unknown_token.

    Args:
      bytecode: The input bytecode, as a single newline-separated string.
      vocab: The vocabulary whose `dictionary` maps statements to indices.

    Returns:
      One vocabulary index per non-label statement, in input order.
    """
    # preprocess() takes a batch of files; the function declarations it also
    # returns are not needed here.
    preprocessed_files, _ = preprocess.preprocess([bytecode.split("\n")])

    # TODO(cec): inline_struct_types_txt

    # Abstract identifiers from statements.
    statements = [
        preprocess.PreprocessStatement(statement)
        for statement in preprocessed_files[0]
    ]

    # Label lines carry no instruction and are skipped. The second alternative
    # must start with "; " (not ":; "): re.match anchors at the start of the
    # statement, so the former spelling could never match a label comment.
    label_pattern = re.compile(r"((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)")
    unknown_index = vocab.dictionary[rgx.unknown_token]

    # Translate from statement to encoded token.
    return [
        vocab.dictionary.get(statement, unknown_index)
        for statement in statements
        if not label_pattern.match(statement)
    ]
Beispiel #6
0
    def EncodeNodes(self, g: nx.DiGraph) -> None:
        """Pre-process the node text and set the text embedding index.

        For each statement node, this sets the 'preprocessed_text' and 'x'
        attributes. Nodes of any other type are left untouched.

        Args:
          g: The graph to encode the nodes of. Modified in place.
        """
        # Collect the statement nodes once. The pre-processed texts are produced
        # only for these nodes, so they must be written back to exactly these
        # nodes rather than to the first N nodes of the graph.
        statement_attrs = [
            data for _, data in g.nodes(data=True)
            if data["type"] == programl_pb2.Node.STATEMENT
        ]
        # preprocess() expects a list of files, each a list of lines; every
        # statement is passed as its own single-line "file".
        lines = [[data["text"]] for data in statement_attrs]
        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        # A statement may have been removed entirely by preprocess(), leaving
        # an empty list; fall back to the empty string in that case.
        preprocessed_texts = [
            inst2vec_preprocess.PreprocessStatement(x[0] if x else "")
            for x in preprocessed_lines
        ]

        # 'x' must hold a dictionary index, so the fallback is the index of
        # "!UNK", not the literal string.
        unknown_index = self.dictionary["!UNK"]

        for data, text in zip(statement_attrs, preprocessed_texts):
            if text:
                data["preprocessed_text"] = text
                # Look up the pre-processed statement; the raw 'text' is not
                # what the dictionary is being asked about here.
                data["x"] = [self.dictionary.get(text, unknown_index)]
            else:
                # Statements that were cleaned away entirely are unknown.
                data["preprocessed_text"] = "!UNK"
                data["x"] = [unknown_index]
Beispiel #7
0
    def EncodeLlvmBytecode(
        self,
        llvm_bytecode: str,
        options: inst2vec_pb2.EncodeBytecodeOptions = (
            inst2vec_pb2.EncodeBytecodeOptions()),
        struct_dict: typing.Dict[str, str] = None,
    ) -> inst2vec_pb2.EncodeBytecodeResult:
        """Encode an LLVM bytecode using the given vocabulary.

        Args:
          llvm_bytecode: LLVM bytecode as a string.
          options: The options to use for encoding. These select which optional
            fields of the result are populated.
          struct_dict: The struct rewrite table. If not provided (or empty),
            this is derived from the bytecode with GetStructDict().

        Returns:
          An EncodeBytecodeResult message holding the input bytecode, the
          encoded sequence, and whichever optional fields `options` enabled.
        """
        result = inst2vec_pb2.EncodeBytecodeResult(
            input_bytecode=llvm_bytecode)
        raw_lines = llvm_bytecode.split("\n")

        # Get the dictionary of structures defined in the file.
        struct_dict = struct_dict or GetStructDict(raw_lines)
        if options.set_struct_dict:
            for struct_name, definition in struct_dict.items():
                result.struct_dict[struct_name] = definition

        # Source code pre-processing.
        # TODO(cec): Merge i2v_prep.preprocess() and PreprocessLlvmBytecode().
        preprocessed_files, _ = i2v_prep.preprocess([raw_lines])
        statements = PreprocessLlvmBytecode(preprocessed_files[0], struct_dict)
        if options.set_bytecode_after_preprocessing:
            result.bytecode_after_preprocessing = "\n".join(statements)

        # Construct indexed sequence.
        label_pattern = re.compile(
            r"((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)")
        record_unknown = options.set_unknown_statements
        indices = []
        for statement in statements:
            # Labels are not encoded.
            if label_pattern.match(statement):
                continue

            # Statements on the cutoff list are treated as unknown.
            if statement in self.cutoff_stmts:
                if record_unknown:
                    result.unknown_statements.append(statement)
                statement = rgx_utils.unknown_token

            # Anything still missing from the dictionary is unknown as well.
            if statement not in self.dictionary:
                if record_unknown:
                    result.unknown_statements.append(statement)
                statement = rgx_utils.unknown_token

            indices.append(self.dictionary[statement])

        result.encoded.extend(indices)
        return result
Beispiel #8
0
    def Encode(self,
               proto: program_graph_pb2.ProgramGraph,
               ir: Optional[str] = None) -> program_graph_pb2.ProgramGraph:
        """Pre-process the node text and set the text embedding index.

        For each instruction node, this appends the 'inst2vec_preprocessed' and
        'inst2vec_embedding' features. Variable and constant nodes receive only
        'inst2vec_embedding', using the "!IDENTIFIER" and "!IMMEDIATE"
        dictionary entries respectively. The graph-level feature
        'inst2vec_annotated' has 1 appended.

        Args:
          proto: The ProgramGraph to encode. Modified in place.
          ir: The LLVM IR that was used to construct the graph. This is required
            for struct inlining. If struct inlining is not required, this may be
            omitted.

        Returns:
          The input proto.
        """
        # Gather the instruction texts to pre-process. preprocess() expects a
        # list of files, each a list of lines; every instruction is passed as
        # its own single-line "file".
        lines = [[NodeFullText(node)] for node in proto.node
                 if node.type == node_pb2.Node.INSTRUCTION]

        if ir:
            # NOTE(github.com/ChrisCummins/ProGraML/issues/57): Extract the
            # struct definitions from the IR and inline their definitions in
            # place of the struct names. This is brittle string substitution;
            # in the future this inlining should happen in llvm2graph, where a
            # parsed llvm::Module is available.
            try:
                structs = inst2vec_preprocess.GetStructTypes(ir)
            except ValueError:
                # Struct inlining is best-effort: if the struct types cannot be
                # extracted, encode the instructions without inlining.
                structs = {}
            for line in lines:
                for struct, definition in structs.items():
                    line[0] = line[0].replace(struct, definition)

        preprocessed_lines, _ = inst2vec_preprocess.preprocess(lines)
        # An instruction may have been removed entirely by preprocess(),
        # leaving an empty list; fall back to the empty string in that case.
        preprocessed_texts = [
            inst2vec_preprocess.PreprocessStatement(x[0] if x else "")
            for x in preprocessed_lines
        ]

        # Add the node features.
        unknown_embedding = self.dictionary["!UNK"]
        var_embedding = self.dictionary["!IDENTIFIER"]
        const_embedding = self.dictionary["!IMMEDIATE"]

        # preprocessed_texts holds one entry per instruction node, in the same
        # order the instruction nodes appear in proto.node.
        text_index = 0
        for node in proto.node:
            if node.type == node_pb2.Node.INSTRUCTION:
                text = preprocessed_texts[text_index]
                text_index += 1
                # The dictionary is keyed by str (see the "!UNK" lookups
                # above), so look up the text itself. Looking up its UTF-8
                # bytes never matches a str key and would map every
                # instruction to !UNK.
                embedding = self.dictionary.get(text, unknown_embedding)
                # The bytes_list feature stores the UTF-8 encoded text.
                node.features.feature[
                    "inst2vec_preprocessed"].bytes_list.value.append(
                        text.encode("utf-8"))
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(embedding)
            elif node.type == node_pb2.Node.VARIABLE:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        var_embedding)
            elif node.type == node_pb2.Node.CONSTANT:
                node.features.feature[
                    "inst2vec_embedding"].int64_list.value.append(
                        const_embedding)

        proto.features.feature["inst2vec_annotated"].int64_list.value.append(1)
        return proto