Example #1
def test_VocabularyZipFile_EncodeLlvmBytecode_struct_dict(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test that struct appears in struct_dict."""
    options = inst2vec_pb2.EncodeBytecodeOptions(set_struct_dict=True)
    result = vocab.EncodeLlvmBytecode(BYTECODE_WITH_STRUCT, options)

    assert dict(result.struct_dict) == {"%struct.Foo": "{ i32, i32 }"}
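The test functions in these examples take a `vocab` fixture of type `vocabulary.VocabularyZipFile`, which is not shown. Below is a minimal sketch of how such a fixture might be defined; the import path `deeplearning.ncc.vocabulary`, the constructor taking a path to the vocabulary zip, the context-manager usage, and `VOCAB_ZIP_PATH` are all assumptions, not part of the examples.

import pathlib

import pytest

from deeplearning.ncc import vocabulary

# Hypothetical path to the inst2vec vocabulary archive.
VOCAB_ZIP_PATH = pathlib.Path("path/to/inst2vec_vocabulary.zip")


@pytest.fixture(scope="function")
def vocab() -> vocabulary.VocabularyZipFile:
    """Yield a vocabulary, assuming VocabularyZipFile is a context manager."""
    with vocabulary.VocabularyZipFile(VOCAB_ZIP_PATH) as v:
        yield v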
Example #2
def test_VocabularyZipFile_EncodeLlvmBytecode_preprocessing(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test output of pre-processing bytecode."""
    options = inst2vec_pb2.EncodeBytecodeOptions(
        set_bytecode_after_preprocessing=True)
    result = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR, options)

    assert (result.bytecode_after_preprocessing == """\
define i32 <@ID>(i32)
<%ID> = alloca i32, align 4
<%ID> = alloca i32, align 4
store i32 <%ID>, i32* <%ID>, align 4
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = srem i32 <%ID>, <INT>
<%ID> = icmp eq i32 <%ID>, <INT>
br i1 <%ID>, label <%ID>, label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>
store i32 <INT>, i32* <%ID>, align 4
br label <%ID>
; <label>:<LABEL>: ; preds = <LABEL>, <LABEL>
<%ID> = load i32, i32* <%ID>, align 4
ret i32 <%ID>""")
Example #3
def EncodeAndPadSourcesWithInst2Vec(
    df: pd.DataFrame,
    vocab: inst2vec_vocabulary.VocabularyZipFile,
    datafolder: pathlib.Path,
    max_sequence_len: typing.Optional[int] = None,
) -> typing.Tuple[np.ndarray, int]:
    """Encode and pad source code using inst2vec translation."""
    sequence_lengths = []
    sequences = []

    # A map from source file paths to encoded sequences, since multiple rows
    # in the dataframe may refer to the same source file.
    src_path_to_sequence = {}

    src_paths = list(
        set(
            DataFrameRowToKernelSrcPath(row, datafolder)
            for _, row in df.iterrows()))

    # Chunk the srcs and process in parallel.
    srcs_per_process = 16
    encode_args = [(src_paths[i:i + srcs_per_process], datafolder)
                   for i in range(0, len(src_paths), srcs_per_process)]
    batches = multiprocessing.Pool().starmap(_EncodeSourceBatchOrDie,
                                             encode_args)
    for batch in batches:
        for src_file_path, bytecode in batch:
            app.Log(2, "Encoding %s", src_file_path.name)
            sequence = list(vocab.EncodeLlvmBytecode(bytecode).encoded)
            src_path_to_sequence[src_file_path] = sequence

    for _, row in df.iterrows():
        src_file_path = DataFrameRowToKernelSrcPath(row, datafolder)
        sequence = src_path_to_sequence[src_file_path]

        sequence_lengths.append(len(sequence))
        sequences.append(sequence)

    if max_sequence_len is None:
        max_sequence_len = max(sequence_lengths)
    app.Log(
        2,
        "Sequence lengths: min=%d, avg=%.2f, max=%d",
        min(sequence_lengths),
        np.mean(sequence_lengths),
        max_sequence_len,
    )

    encoded = np.array(
        keras_sequence.pad_sequences(sequences,
                                     maxlen=max_sequence_len,
                                     value=vocab.unknown_token_index))
    encoded = np.vstack([np.expand_dims(x, axis=0) for x in encoded])

    return encoded, max_sequence_len
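A usage sketch for `EncodeAndPadSourcesWithInst2Vec`; the dataframe `df`, the vocabulary zip path `VOCAB_ZIP`, the data folder `DATA_FOLDER`, and the context-manager use of `inst2vec_vocabulary.VocabularyZipFile` are assumptions that only illustrate the calling convention.

# Hypothetical call site: df, VOCAB_ZIP and DATA_FOLDER are assumptions.
with inst2vec_vocabulary.VocabularyZipFile(VOCAB_ZIP) as vocab:
    encoded, seq_len = EncodeAndPadSourcesWithInst2Vec(
        df, vocab, DATA_FOLDER, max_sequence_len=1024)
# `encoded` has one row per dataframe row, padded to `seq_len` with the
# unknown-token index.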
Example #4
def test_VocabularyZipFile_EncodeLlvmBytecode_encode_single_line(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test encoding a single line of bytecode."""
    # A single line of bytecode which references a struct.
    result = vocab.EncodeLlvmBytecode(
        "store %struct.Foo* %0, %struct.Foo** %2, align 8",
        options=inst2vec_pb2.EncodeBytecodeOptions(
            set_bytecode_after_preprocessing=True, ),
        struct_dict={"%struct.Foo": "{ i32, i32 }"},
    )

    assert (result.bytecode_after_preprocessing ==
            "store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8")
Example #5
def test_VocabularyZipFile_EncodeLlvmBytecode_struct_not_in_preprocessed(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test that struct is inlined during pre-processing."""
    options = inst2vec_pb2.EncodeBytecodeOptions(
        set_bytecode_after_preprocessing=True)
    result = vocab.EncodeLlvmBytecode(BYTECODE_WITH_STRUCT, options)

    assert (result.bytecode_after_preprocessing == """\
define void <@ID>({ i32, i32 }*)
<%ID> = alloca { i32, i32 }*, align 8
store { i32, i32 }* <%ID>, { i32, i32 }** <%ID>, align 8
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = load { i32, i32 }*, { i32, i32 }** <%ID>, align 8
<%ID> = getelementptr inbounds { i32, i32 }, { i32, i32 }* <%ID>, i32 <INT>, i32 <INT>
<%ID> = load i32, i32* <%ID>, align 4
<%ID> = add nsw i32 <%ID>, <%ID>
store i32 <%ID>, i32* <%ID>, align 4
ret void""")
Example #6
def test_VocabularyZipFile_EncodeLlvmBytecode_sequence(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test the length of the encoded sequence."""
    # Function contains 14 statements.
    result = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR)
    assert len(result.encoded) == 14
Example #7
def test_VocabularyZipFile_EncodeLlvmBytecode_bytecode(
    vocab: vocabulary.VocabularyZipFile, ):
    """Test that bytecode is set in return value."""
    result = vocab.EncodeLlvmBytecode(FIZZBUZZ_IR)
    assert result.input_bytecode == FIZZBUZZ_IR
Example #8
def CreateSeqDirFromIr(folder_ir: str,
                       vocab: vocabulary.VocabularyZipFile) -> str:
    """Transform a folder of raw IR into trainable data to be used as input data
  in tasks.

  Args:
    folder_ir: The folder of LLVM IR to read. Must end in '_ir'.
    vocab: The vocabulary to use to encode IR.

  Returns:
    The path of the folder of sequences, ending in '_seq'.
  """
    # Setup
    assert folder_ir, "Please specify a folder containing the raw LLVM IR"
    assert os.path.exists(folder_ir), "Folder not found: " + folder_ir
    folder_seq = re.sub("_ir$", "_seq", folder_ir)
    if folder_seq:
        app.Log(1, "Preparing to write LLVM IR index sequences to %s",
                folder_seq)
        if not os.path.exists(folder_seq):
            os.makedirs(folder_seq)

    # Get sub-folders if there are any
    listing = os.listdir(folder_ir + "/")
    folders_ir = list()
    folders_seq = list()
    found_subfolder = False
    for path in listing:
        if os.path.isdir(os.path.join(folder_ir, path)):
            folders_ir.append(os.path.join(folder_ir, path))
            folders_seq.append(os.path.join(folder_seq, path))
            found_subfolder = True
    if found_subfolder:
        app.Log(1, "Found %d subfolders", len(folders_ir))
    else:
        app.Log(1, "No subfolders found in %s", folder_ir)
        folders_ir = [folder_ir]
        folders_seq = [folder_seq]

    # Loop over sub-folders
    for i, raw_ir_folder in enumerate(folders_ir):

        l = folders_seq[i] + "/"
        if not os.path.exists(l) or not os.listdir(l):
            # Read data from folder
            raw_data, file_names = read_data_files_from_folder(raw_ir_folder)

            # Write indexed sequence of statements
            seq_folder = folders_seq[i]
            if not os.path.exists(seq_folder):
                os.makedirs(seq_folder)

            # Write indexed sequence of statements to files.
            for i, file in enumerate(raw_data):
                result = vocab.EncodeLlvmBytecode(file)

                # Write to csv
                file_name_csv = os.path.join(seq_folder,
                                             file_names[i][:-3] + "_seq.csv")
                file_name_rec = os.path.join(seq_folder,
                                             file_names[i][:-3] + "_seq.rec")
                with open(file_name_csv,
                          "w") as csv, open(file_name_rec, "wb") as rec:
                    for ind in result.encoded:
                        csv.write(str(ind) + "\n")
                        rec.write(struct.pack("I", int(ind)))

    return folder_seq
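A usage sketch for `CreateSeqDirFromIr`; the folder name `kernels_ir` (which must end in `_ir`) and the vocabulary zip path `VOCAB_ZIP` are hypothetical, as is the context-manager use of `VocabularyZipFile`.

# Hypothetical usage: "kernels_ir" and VOCAB_ZIP are assumptions.
with vocabulary.VocabularyZipFile(VOCAB_ZIP) as vocab:
    seq_folder = CreateSeqDirFromIr("kernels_ir", vocab)
# seq_folder == "kernels_seq", containing one *_seq.csv and one *_seq.rec
# file per input IR file.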
Example #9
def EncodeGraph(
  graph: llvm_util.LlvmControlFlowGraph,
  vocab: inst2vec_vocabulary.VocabularyZipFile,
  session: tf.compat.v1.Session,
  embedding_lookup_op,
  embedding_lookup_input_ph,
) -> llvm_util.LlvmControlFlowGraph:
  """Encode inst2vec attributes on an LLVM control flow graph.

  For every node in the graph, this adds two keys to the data dictionary:
  'inst2vec_encoded', containing the node's index into the vocabulary, and
  'inst2vec', containing the numpy embedding array.

  Args:
    graph: The graph to encode.
    vocab: The vocabulary to encode with.
    session: The TensorFlow session used to run the embedding lookup.
    embedding_lookup_op: The embedding lookup op.
    embedding_lookup_input_ph: The input placeholder for the embedding lookup
      indices.
  Returns:
    The graph.
  """
  # Encode the entire file with debugging options set. We need to process
  # the entire file so that we can get the struct_dict, which we will need
  # when encoding individual nodes. This could be made faster by simply
  # calling `vocab.GetStructDict(graph.graph['llvm_bytecode'].split('\n'))`,
  # but the extra debug information is useful.
  result = vocab.EncodeLlvmBytecode(
    graph.graph["llvm_bytecode"],
    inst2vec_pb2.EncodeBytecodeOptions(
      set_bytecode_after_preprocessing=True,
      set_unknown_statements=True,
      set_struct_dict=True,
    ),
  )

  # if len(result.encoded) != graph.number_of_nodes():
  #   raise ValueError(
  #       f"Encoded bytecode file contains {len(result.encoded)} statements, "
  #       f"but full flow graph contains {graph.number_of_nodes()} nodes. The "
  #       "two should be equal")

  # Protocol buffer maps aren't true dicts and have differing semantics.
  struct_dict = dict(result.struct_dict)

  # Set debug info as global graph attributes.
  graph.graph["num_unknown_statements"] = len(result.unknown_statements)
  graph.graph["struct_dict"] = struct_dict
  graph.graph[
    "llvm_bytecode_preprocessed"
  ] = result.bytecode_after_preprocessing

  for _, data in graph.nodes(data=True):
    bytecode = data["text"]

    # Encode the node's bytecode using the struct dict we derived from the
    # entire file. Since this is a full-flow graph, each instruction's
    # bytecode is a single statement.
    encoded = vocab.EncodeLlvmBytecode(
      bytecode, struct_dict=struct_dict
    ).encoded
    if len(encoded) != 1:
      raise ValueError(
        f"Encoded line `{bytecode}` to {len(encoded)} statements"
      )
    data["inst2vec_encoded"] = encoded[0]

    # Lookup the encoded value in the embedding matrix.
    # TODO(cec): This is a very slow way of doing it. Better would be to
    # collect the encoded values into an array and perform the embedding
    # lookup once.
    sequences = np.array(encoded, dtype=np.int32).reshape((1, 1))
    embedding_vector = session.run(
      embedding_lookup_op, feed_dict={embedding_lookup_input_ph: sequences}
    )
    data["inst2vec"] = embedding_vector[0][0]

  return graph
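The `session`, `embedding_lookup_op`, and `embedding_lookup_input_ph` arguments are not documented above. Below is a minimal sketch of how they could be wired up with `tf.compat.v1`, assuming `embedding_matrix` is a numpy array of inst2vec embeddings with shape (vocab_size, embedding_dim) and that `graph` and `vocab` are already constructed.

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

# Placeholder for a single encoded statement, shaped (1, 1) to match the
# reshape performed inside EncodeGraph.
embedding_lookup_input_ph = tf.compat.v1.placeholder(
  dtype=tf.int32, shape=(1, 1), name="inst2vec_indices")

# Look the index up in the (assumed) embedding matrix.
embedding_lookup_op = tf.nn.embedding_lookup(
  params=tf.constant(embedding_matrix, dtype=tf.float32),
  ids=embedding_lookup_input_ph)

with tf.compat.v1.Session() as session:
  encoded_graph = EncodeGraph(
    graph, vocab, session, embedding_lookup_op, embedding_lookup_input_ph)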