def compare_impl(full_raw_stack, full_actuals, chunk_raw_stack, chunk_actuals,
                 fancy_func=False, external_trim=None):
    # cross check both stacks -- make sure they have the same shape
    cross_check(full_raw_stack, chunk_raw_stack)
    # build a full tree -- just to get the aten op name
    root = stack_parser.exec_time_tree(full_raw_stack, fancy_func=fancy_func)
    # this must be true -- the root has exactly 2 children -- the aten op and "other"
    op_name = root.children[0].func
    # process CPU_log and HB_log
    # CPU_log should be given by the full input data
    # HB_log should be given by the chunk input data
    cpu_log = process_CPU_stack.parse(full_raw_stack, fancy_func=fancy_func)
    hb_log = process_HB_stack.parse(chunk_raw_stack, fancy_func=fancy_func, trimming=True)
    # re-apply a trim of 0 if external trimming is enabled --
    # since there can be more than one @TRIM@ node, we cannot just adjust those nodes directly
    if external_trim is not None:
        def reset_trim(root):
            if root.func == "@TRIM@":
                root.time = float(0)
        hb_log = traversal(hb_log, reset_trim)
        stack_parser.exec_time_apply_trim(hb_log)
    # get the total time on device by accumulating all simulated time
    total_time_on_HB = 0
    if external_trim is None:
        def acc_trim(root):
            nonlocal total_time_on_HB
            if root.func == "@TRIM@":
                total_time_on_HB += root.time
        traversal(hb_log, acc_trim)
    else:
        total_time_on_HB = external_trim
    # debug
    print(stack_parser.exec_time_print_tree(cpu_log))
    print(stack_parser.exec_time_print_tree(hb_log))
    print("total time on HB = " + str(total_time_on_HB))
    # process input tensors
    actuals = actual_parser.parse(full_actuals, chunk_actuals)
    return ATen_OP(op_name, cpu_log, hb_log, total_time_on_HB, actuals)

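# `traversal` is used above but not defined in this excerpt. Below is a minimal
# sketch of what compare_impl assumes about it and about the node objects it
# walks: a pre-order visit that applies `fn` to every node and returns the root
# (so it works both for side effects and in assignment position). The Node
# shape (func / time / children) is inferred from the accesses in compare_impl;
# the real definitions live elsewhere in this package.
from dataclasses import dataclass, field
from typing import Callable, List


@dataclass
class Node:
    func: str                                  # frame / marker name, e.g. "@TRIM@"
    time: float                                # execution time attributed to this node
    children: List["Node"] = field(default_factory=list)


def traversal(root: Node, fn: Callable[[Node], None]) -> Node:
    # apply fn to the node itself, then recurse into its children
    fn(root)
    for child in root.children:
        traversal(child, fn)
    return root
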
def test_compare_aten_op_1():
    aten_op = compare(current_path + "/demo/full.std",
                      current_path + "/demo/chunk.std",
                      current_path + "/demo/manycore_stats.log")
    # pytest assertions
    assert aten_op.hb_device_time == 1204.603
    assert aten_op.hb_host_time == 170.0
    assert aten_op.xeon_time == 34.451
    cpu_graph = stack_parser.exec_time_print_tree(aten_op.cpu_log)
    assert cpu_graph == "|- Node(@CPU_LOG@ : 34.451)\n |- Node(at::Tensor at::CPUType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.043)\n |- Node(at::native::add_stub::add_stub() : 34.316)"
    hb_graph = stack_parser.exec_time_print_tree(aten_op.hb_log)
    assert hb_graph == "|- Node(@HB_LOG@ : 170.0)\n |- Node(at::Tensor at::HammerBladeType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.004)\n |- Node(at::native::add_stub::add_stub() : 170.0)\n |- Node(@OFFLOAD_KERNEL@__tensorlib_add : 0.0)\n |- Node(@TRIM@ : 0.0)"

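# `compare` and `ATen_OP` are defined elsewhere; this is a hypothetical sketch
# of the result container, inferred from the constructor call at the end of
# compare_impl and from the attributes read by test_compare_aten_op_1 above.
# How xeon_time and hb_host_time are actually derived is an assumption (the
# asserted values match the root times of the respective logs).
class ATen_OP:
    def __init__(self, op_name, cpu_log, hb_log, total_time_on_HB, actuals):
        self.op_name = op_name
        self.cpu_log = cpu_log                    # CPU-side execution tree
        self.hb_log = hb_log                      # HammerBlade-side execution tree
        self.actuals = actuals                    # parsed input tensors
        self.xeon_time = cpu_log.time             # total CPU (host) time
        self.hb_host_time = hb_log.time           # host-observed HB time
        self.hb_device_time = total_time_on_HB    # accumulated simulated device time
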
def test_hb_stack_1():
    hb_log = process_HB_stack.parse(raw_stack)
    graph = stack_parser.exec_time_print_tree(hb_log)
    assert graph == """|- Node(@HB_LOG@ : 0.234)\n |- Node(at::Tensor at::HammerBladeType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.01)\n |- Node(at::native::add_stub::add_stub() : 0.187)\n |- Node(@OFFLOAD_KERNEL@__tensorlib_add : 0.145)\n |- Node(@TRIM@ : 0.0)"""

def test_cpu_stack_1():
    cpu_log = process_CPU_stack.parse(raw_stack)
    graph = stack_parser.exec_time_print_tree(cpu_log)
    assert graph == """|- Node(@CPU_LOG@ : 0.067)\n |- Node(at::Tensor at::CPUType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.015)\n |- Node(at::native::add_stub::add_stub() : 0.009)"""

def test_stack_parsing_1():
    root = stack_parser.exec_time_tree(raw_stack)
    graph = stack_parser.exec_time_print_tree(root)
    assert graph == """|- Node(time_in_roi : 0.399)\n |- Node(at::Tensor at::CPUType::{anonymous}::add(const at::Tensor&, const at::Tensor&, c10::Scalar) : 0.399)\n |- Node(@CPU_LOG@ : 0.067)\n |- Node(at::Tensor at::CPUType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.015)\n |- Node(at::native::add_stub::add_stub() : 0.009)\n |- Node(@HB_LOG@ : 0.234)\n |- Node(at::Tensor at::HammerBladeType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.01)\n |- Node(at::native::add_stub::add_stub() : 0.187)\n |- Node(@OFFLOAD_KERNEL@__tensorlib_add : 0.145)\n |- Node(@TRIM@ : 0.0)\n |- Node(at::Tensor at::CPUType::{anonymous}::llcopy(const at::Tensor&) : 0.025)\n |- Node(other : 0.0)"""
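
# Usage note: the tests above are ordinary pytest functions. `raw_stack` and
# `current_path` are module-level fixtures referenced here but not shown in
# this excerpt; assuming they are defined (along with the demo/*.std and
# demo/manycore_stats.log fixture files), the whole module runs under a plain
# `pytest -q <this file>` invocation (the filename is not shown in this excerpt).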