def test_add_tree_1():
    # Exercise add_tree on two trees parsed from the same raw_stack, then
    # compare the dot-graph renderings produced by draw_tree.
    root1 = stack_parser.exec_time_tree(raw_stack)
    root2 = stack_parser.exec_time_tree(raw_stack)
    sum_tree = add_tree(root1, root2)
    # the two inputs are rendered only after add_tree has run on them
    graph1 = draw_tree(root1)
    graph2 = draw_tree(root2)
    graph = draw_tree(sum_tree)
    # both inputs were built from the same raw_stack, so they must render alike
    assert graph1 == graph2
    # NOTE(review): the expected literal below looks truncated in this view --
    # the digraph is never closed and the literal ends with the text of a
    # second assertion on `graph`, so `graph` is never actually checked here.
    # Presumably some lines are missing; confirm against the full file.
    assert graph1 == """
digraph {
0 [ shape=record label = "time_in_roi|0.40"];
1 [ shape=record label = "add|0.40"];
2 [ shape=record label = "@CPU_LOG@|0.07"];
3 [ shape=record label = "empty|0.01"];
4 [ shape=record label = "add_stub|0.01"];
5 [ shape=record label = "@HB_LOG@|0.23"];
6 [ shape=record label = "empty|0.01"];
7 [ shape=record label = "add_stub|0.19"];
8 [ shape=record label = "@OFFLOAD_KERNEL@__tensorlib_add|0.14"];
10 [ shape=record label = "llcopy|0.03"];
11 [ shape=record label = "other|0.00"];
0 -> 1;
0 -> 11;
1 -> 2;
1 -> 5;
1 -> 10;
2 -> 3;
2 -> 4;
5 -> 6;
5 -> 7;
7 -> 8;
    assert graph == """
# NOTE(review): this definition is incomplete in the current view -- the
# parameter list is cut off after full_raw_stack and the process_HB_stack.parse
# call below is cut off as well. The comments describe only what is visible.
def compare_impl(full_raw_stack,

    # cross check both stacks -- make sure they have the same shape
    cross_check(full_raw_stack, chunk_raw_stack)

    # build a full tree -- just to get the aten op name
    root = stack_parser.exec_time_tree(full_raw_stack, fancy_func=fancy_func)
    # this must be true -- the root has 2 children -- the aten op and other
    op_name = root.children[0].func

    # process CPU_log and HB_log
    # CPU_log should be given by full input data
    # HB_log should be given by chunk input data
    cpu_log = process_CPU_stack.parse(full_raw_stack, fancy_func=fancy_func)
    hb_log = process_HB_stack.parse(chunk_raw_stack,

    # re-apply a trim of 0 if external trimming is enabled
    # since we can have more than one @TRIM@ node, we cannot just adjust these nodes
    if external_trim is not None:

        # callback: zero out the time of every @TRIM@ node it is handed
        def reset_trim(root):
            if root.func == "@TRIM@":
                root.time = float(0)

        hb_log = traversal(hb_log, reset_trim)

    # get total time on device
    # so we accumulate all simulated time
    total_time_on_HB = 0
    if external_trim is None:

        # callback: add the time of every @TRIM@ node to the running total
        def acc_trim(root):
            nonlocal total_time_on_HB
            if root.func == "@TRIM@":
                total_time_on_HB += root.time

        traversal(hb_log, acc_trim)
        # NOTE(review): an `else:` appears to be missing ahead of the next line
        # in this view; as shown, the accumulated total is overwritten by
        # external_trim inside the `is None` branch -- confirm against the file.
        total_time_on_HB = external_trim

    # debug
    print("total time on HB = " + str(total_time_on_HB))

    # process input tensors
    actuals = actual_parser.parse(full_actuals, chunk_actuals)

    # bundle the op name, both logs, the device time and the parsed actuals
    return ATen_OP(op_name, cpu_log, hb_log, total_time_on_HB, actuals)
# Assert that the full-input and chunk-input stacks produce matching entries
# when their execution-time trees are traversed.
# NOTE(review): the bodies of the two gather callbacks are not visible in this
# view; presumably each appends root.func to its list -- confirm.
def cross_check(full_raw_stack, chunk_raw_stack):
    full_tree = stack_parser.exec_time_tree(full_raw_stack)
    chunk_tree = stack_parser.exec_time_tree(chunk_raw_stack)
    full_func_list = []

    def gather_full_func(root):

    chunk_func_list = []

    def gather_chunk_func(root):

    # run each callback over its own tree
    traversal(full_tree, gather_full_func)
    traversal(chunk_tree, gather_chunk_func)
    # first the entry counts must agree ...
    assert len(full_func_list) == len(chunk_func_list)
    # ... then the entries themselves, compared as one "<|>"-joined string
    full_func = "<|>".join(full_func_list)
    chunk_func = "<|>".join(chunk_func_list)
    assert full_func == chunk_func
def parse(raw_stack, fancy_func=False):
    """Return the ``@CPU_LOG@`` subtree of a per-ATen-op execution time tree.

    The tree is built with ``stack_parser.exec_time_tree`` from ``raw_stack``;
    ``fancy_func`` is forwarded to that call unchanged.

    Raises:
        AssertionError: if the root does not have exactly two children, if the
            second child is not ``other``, or if the op node has no
            ``@CPU_LOG@`` child.
    """
    tree = stack_parser.exec_time_tree(raw_stack, fancy_func=fancy_func)

    # expected layout in the per ATen OP context:
    #            root
    #        /          \
    #   aten::op      other
    assert len(tree.children) == 2
    assert tree.children[1].func == "other"
    op_node = tree.children[0]

    # expected layout below the op:
    #            aten::op
    #      /        |        \
    #   CPU_log  ******   HB_log
    cpu_nodes = [node for node in op_node.children if node.func == "@CPU_LOG@"]

    # a CPU log is mandatory for this parser; if several children match,
    # the last one is the one returned
    assert cpu_nodes
    return cpu_nodes[-1]
def test_stack_parsing_1():
    """Parse raw_stack and compare the printed tree with a fixed rendering."""
    tree = stack_parser.exec_time_tree(raw_stack)
    rendered = stack_parser.exec_time_print_tree(tree)
    # one literal per printed node; adjacent literals concatenate into the
    # single expected string
    expected = (
        "|- Node(time_in_roi : 0.399)\n"
        "  |- Node(at::Tensor at::CPUType::{anonymous}::add(const at::Tensor&, const at::Tensor&, c10::Scalar) : 0.399)\n"
        "    |- Node(@CPU_LOG@ : 0.067)\n"
        "      |- Node(at::Tensor at::CPUType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.015)\n"
        "      |- Node(at::native::add_stub::add_stub() : 0.009)\n"
        "    |- Node(@HB_LOG@ : 0.234)\n"
        "      |- Node(at::Tensor at::HammerBladeType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.01)\n"
        "      |- Node(at::native::add_stub::add_stub() : 0.187)\n"
        "        |- Node(@OFFLOAD_KERNEL@__tensorlib_add : 0.145)\n"
        "          |- Node(@TRIM@ : 0.0)\n"
        "    |- Node(at::Tensor at::CPUType::{anonymous}::llcopy(const at::Tensor&) : 0.025)\n"
        "  |- Node(other : 0.0)"
    )
    assert rendered == expected