Ejemplo n.º 1
0
def test_VocabularyZipFile_EncodeLlvmBytecode_preprocessing(
    vocab: vocabulary.VocabularyZipFile,
):
    """Check the pre-processed bytecode reported when encoding FIZZBUZZ_IR.

    The encoder is asked to populate `bytecode_after_preprocessing`, and the
    returned text is compared verbatim against the expected listing below.
    """
    # Expected output, one pre-processed statement per line. The leading
    # backslash suppresses the newline after the opening quotes.
    expected_preprocessed = """\
define i32 <@ID>(i32)
<%ID> = alloca i32, align 4
<%ID> = alloca i32, align 4
store i32 <%ID>, i32* <%ID>, align 4
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = srem i32 <%ID>, <INT>
<%ID> = icmp eq i32 <%ID>, <INT>
br i1 <%ID>, label <%ID>, label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>, <LABEL>
<%ID> = load i32, i32* <%ID>, align 4
ret i32 <%ID>"""

    encode_options = inst2vec_pb2.EncodeBytecodeOptions(
        set_bytecode_after_preprocessing=True)
    encoded = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR, encode_options)

    assert encoded.bytecode_after_preprocessing == expected_preprocessed
Ejemplo n.º 2
0
def test_VocabularyZipFile_EncodeLlvmBytecode_struct_dict(
    vocab: vocabulary.VocabularyZipFile,
):
    """Check that encoding BYTECODE_WITH_STRUCT reports its struct definition.

    With `set_struct_dict` enabled, the result's `struct_dict` must contain
    exactly one entry mapping `%struct.Foo` to its literal type.
    """
    expected_structs = {"%struct.Foo": "{ i32, i32 }"}

    encoded = vocab.EncodeLlvmBytecode(
        BYTECODE_WITH_STRUCT,
        inst2vec_pb2.EncodeBytecodeOptions(set_struct_dict=True),
    )

    # Convert to a plain dict before comparing against the expected dict.
    observed_structs = dict(encoded.struct_dict)
    assert observed_structs == expected_structs
Ejemplo n.º 3
0
def test_VocabularyZipFile_EncodeLlvmBytecode_encode_single_line(
    vocab: vocabulary.VocabularyZipFile,
):
    """Check encoding of one statement using a caller-supplied struct dict.

    The statement references `%struct.Foo`; the struct definition is passed
    in through `struct_dict`, and the pre-processed output is expected to
    show the struct's literal type in place of its name.
    """
    # A single line of bytecode which references a struct.
    statement = "store %struct.Foo* %0, %struct.Foo** %2, align 8"
    known_structs = {"%struct.Foo": "{ i32, i32 }"}
    encode_options = inst2vec_pb2.EncodeBytecodeOptions(
        set_bytecode_after_preprocessing=True)

    encoded = vocab.EncodeLlvmBytecode(
        statement,
        options=encode_options,
        struct_dict=known_structs,
    )

    expected = "store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8"
    assert encoded.bytecode_after_preprocessing == expected
Ejemplo n.º 4
0
def test_VocabularyZipFile_EncodeLlvmBytecode_struct_not_in_preprocessed(
    vocab: vocabulary.VocabularyZipFile,
):
    """Check that struct names are replaced by their types when pre-processing.

    Encodes BYTECODE_WITH_STRUCT with `set_bytecode_after_preprocessing`
    enabled and compares the returned text verbatim against the expected
    listing, in which every struct reference appears as `{ i32, i32 }`.
    """
    # Expected output, one pre-processed statement per line. The leading
    # backslash suppresses the newline after the opening quotes.
    expected_preprocessed = """\
define void <@ID>({ i32, i32 }*)
<%ID> = alloca { i32, i32 }*, align 8
store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = add nsw i32 <%ID>, <%ID>
store i32 <%ID>, i32* <%ID>, align 4
ret void"""

    encode_options = inst2vec_pb2.EncodeBytecodeOptions(
        set_bytecode_after_preprocessing=True)
    encoded = vocab.EncodeLlvmBytecode(BYTECODE_WITH_STRUCT, encode_options)

    assert encoded.bytecode_after_preprocessing == expected_preprocessed
Ejemplo n.º 5
0
def EncodeGraph(
  graph: llvm_util.LlvmControlFlowGraph,
  vocab: inst2vec_vocabulary.VocabularyZipFile,
  session: tf.compat.v1.Session,
  embedding_lookup_op,
  embedding_lookup_input_ph,
) -> llvm_util.LlvmControlFlowGraph:
  """Annotate an LLVM control flow graph with inst2vec encodings.

  The graph is modified in place. Every node's data dictionary gains two keys:
  'inst2vec_encoded', the index into the vocabulary obtained by encoding the
  node's 'text', and 'inst2vec', the embedding fetched for that index by
  running the lookup op.

  Three graph-level attributes are also set as debug information:
  'num_unknown_statements', 'struct_dict' and 'llvm_bytecode_preprocessed'.

  Args:
    graph: The graph to encode. Its 'llvm_bytecode' graph attribute and the
      'text' attribute of every node are read.
    vocab: The vocabulary used to encode the bytecode.
    session: The session in which the embedding lookup op is run.
    embedding_lookup_op: The op that is run, once per node, to fetch the
      embedding.
    embedding_lookup_input_ph: The input that is fed a (1, 1) int32 array
      holding the node's encoded value.

  Returns:
    The same graph that was passed in.

  Raises:
    ValueError: If a node's text does not encode to exactly one statement.
  """
  # The whole file is encoded first, with all debugging options enabled,
  # because the struct_dict it yields is required to encode each node on its
  # own afterwards. Calling
  # `vocab.GetStructDict(graph.graph['llvm_bytecode'].split('\n'))` would be
  # faster, but the additional debug information is worth having.
  whole_file_bytecode = graph.graph["llvm_bytecode"]
  debug_options = inst2vec_pb2.EncodeBytecodeOptions(
    set_bytecode_after_preprocessing=True,
    set_unknown_statements=True,
    set_struct_dict=True,
  )
  whole_file = vocab.EncodeLlvmBytecode(whole_file_bytecode, debug_options)

  # NOTE: A sanity check comparing the number of encoded statements in the
  # whole-file result against graph.number_of_nodes() is currently disabled.

  # Copy into a real dict: protocol buffer maps have differing semantics.
  structs = dict(whole_file.struct_dict)

  # Record the debug information as graph-level attributes.
  graph_attrs = graph.graph
  graph_attrs["num_unknown_statements"] = len(whole_file.unknown_statements)
  graph_attrs["struct_dict"] = structs
  graph_attrs["llvm_bytecode_preprocessed"] = (
    whole_file.bytecode_after_preprocessing
  )

  for _node, node_data in graph.nodes(data=True):
    statement = node_data["text"]

    # Each node of a full-flow graph holds a single statement, so encoding it
    # with the file-wide struct dict must yield exactly one value.
    indices = vocab.EncodeLlvmBytecode(statement, struct_dict=structs).encoded
    if len(indices) != 1:
      raise ValueError(
        f"Encoded line `{statement}` to {len(indices)} statements"
      )
    node_data["inst2vec_encoded"] = indices[0]

    # Fetch the embedding for the encoded value.
    # TODO(cec): This is a very slow way of doing it. Better would be to
    # collect the encoded values into an array and perform the embedding
    # lookup once.
    lookup_input = np.array(indices, dtype=np.int32).reshape(1, 1)
    lookup_result = session.run(
      embedding_lookup_op,
      feed_dict={embedding_lookup_input_ph: lookup_input},
    )
    node_data["inst2vec"] = lookup_result[0][0]

  return graph