Example #1
    def test_execution_graph_start_stop(self):
        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        # Create a temp file to save execution graph data.
        fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
        fp.close()
        expected_loop_events = 0
        eg = ExecutionGraphObserver()
        eg.register_callback(fp.name)
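        # Capture two windows: iterations 3-4 and iteration 8. Only loops executed
        # while the observer is running count toward expected_loop_events.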
        for idx in range(10):
            if idx == 3:
                eg.start()
            elif idx == 5:
                eg.stop()
            elif idx == 8:
                eg.start()
            elif idx == 9:
                eg.stop()
                eg.unregister_callback()
            if eg._execution_graph_running:
                expected_loop_events += 1
            with record_function(f"## LOOP {idx} ##"):
                self.payload(use_cuda=use_cuda)

        assert fp.name == eg.get_output_file_path()
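        # Parse the saved execution graph and verify the root node and loop annotations.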
        nodes = self.get_execution_graph_root(fp.name)
        found_root_node = False
        loop_count = 0
        for n in nodes:
            assert "name" in n
            if "[pytorch|profiler|execution_graph|process]" in n["name"]:
                found_root_node = True
            if n["name"].startswith("## LOOP "):
                loop_count += 1
        assert found_root_node
        assert loop_count == expected_loop_events
Example #2
    def test_execution_graph_no_capture(self):
        fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
        fp.close()
        eg = ExecutionGraphObserver()
        eg.register_callback(fp.name)
        eg.unregister_callback()

        assert fp.name == eg.get_output_file_path()
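        # No start()/stop() window was opened, so only the process root node is expected.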
        nodes = self.get_execution_graph_root(fp.name)
        found_root_node = False
        for n in nodes:
            assert "name" in n
            if "[pytorch|profiler|execution_graph|process]" in n["name"]:
                found_root_node = True
        assert found_root_node
Example #3
    def test_execution_graph_with_kineto(self):
        trace_called_num = 0

        def trace_handler(p):
            nonlocal trace_called_num
            trace_called_num += 1

        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        # Create a temp file to save execution graph data.
        fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
        fp.close()
        expected_loop_events = 0
        eg = ExecutionGraphObserver()
        eg.register_callback(fp.name)
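        # Run the execution graph observer alongside the Kineto profiler; the schedule
        # (skip_first=3, wait=1, warmup=1, active=2) makes the trace handler fire twice
        # over the 10 profiler steps, while the observer captures every loop iteration.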
        with profile(
                activities=supported_activities(),
                schedule=torch.profiler.schedule(skip_first=3,
                                                 wait=1,
                                                 warmup=1,
                                                 active=2),
                on_trace_ready=trace_handler,
        ) as p:
            eg.start()
            for idx in range(10):
                expected_loop_events += 1
                with record_function(f"## LOOP {idx} ##"):
                    self.payload(use_cuda=use_cuda)
                p.step()
            eg.stop()

        eg.unregister_callback()

        assert trace_called_num == 2
        assert fp.name == eg.get_output_file_path()
        nodes = self.get_execution_graph_root(fp.name)
        found_root_node = False
        loop_count = 0
        for n in nodes:
            assert "name" in n
            if "[pytorch|profiler|execution_graph|process]" in n["name"]:
                found_root_node = True
            if n["name"].startswith("## LOOP "):
                loop_count += 1
        assert found_root_node
        assert loop_count == expected_loop_events
Example #4
def main():
    parser = argparse.ArgumentParser(description="PyTorch Microbenchmarks")
    parser.add_argument("-c", "--config", type=str, help="The benchmark config file.")
    parser.add_argument(
        "-w", "--warmup", type=int, default=1, help="Number of warm up iterations."
    )
    parser.add_argument(
        "-i", "--iteration", type=int, default=1, help="Number of benchmark iterations."
    )
    parser.add_argument(
        "-b", "--backward", action="store_true", help="Include backward pass."
    )
    parser.add_argument(
        "-d", "--device", type=str, default="cpu", help="Target device for benchmark."
    )
    parser.add_argument(
        "-o",
        "--output-prefix",
        type=str,
        default="benchmark_result",
        help="File name prefix to write benchmark results.",
    )
    parser.add_argument(
        "-r",
        "--resume-id",
        type=str,
        default=None,
        help="Define a resume op_run_id to continue benchmark, skip all previous configs.",
    )
    parser.add_argument(
        "-s",
        "--stop_id",
        type=str,
        default=None,
        help="Define a stop op_run_id (exclusive) to stop benchmark, skip remaining configs.",
    )
    parser.add_argument(
        "-a",
        "--append",
        action="store_true",
        help="Append to output file, rather than overwrite.",
    )
    parser.add_argument(
        "--cuda-l2-cache",
        default="off",
        nargs="?",
        choices=["on", "off"],
        help="Set option for CUDA GPU L2 cache between iterations in discrete mode.",
    )
    parser.add_argument(
        "--ncu", action="store_true", help="Run NSight Compute to collect metrics."
    )
    parser.add_argument(
        "--ncu-bin",
        type=str,
        default=None,
        help="Path to the NSight Compute (ncu) binary.",
    )
    parser.add_argument(
        "--ncu-args-file",
        type=str,
        default=None,
        help="NSight Compute extra command line options (metrics etc.).",
    )
    parser.add_argument(
        "--ncu-warmup",
        type=int,
        default=None,
        help="NSight Systems number of warmup runs.",
    )
    parser.add_argument(
        "--ncu-iteration",
        type=int,
        default=None,
        help="NSight Systems number of measured iteration runs.",
    )
    parser.add_argument(
        "--nsys", action="store_true", help="Run NSight Systems to collect metrics."
    )
    parser.add_argument(
        "--nsys-bin",
        type=str,
        default=None,
        help="Path to the NSight Systems (nsys) binary.",
    )
    parser.add_argument(
        "--nsys-args-file",
        type=str,
        default=None,
        help="NSight Systems extra command line options (metrics etc.).",
    )
    parser.add_argument(
        "--nsys-warmup",
        type=int,
        default=None,
        help="NSight Systems number of warmup runs.",
    )
    parser.add_argument(
        "--nsys-iteration",
        type=int,
        default=None,
        help="NSight Systems number of measured iteration runs.",
    )
    parser.add_argument(
        "--run-batch-size",
        type=int,
        default=50,
        help="Batch run input size (number of input configs to run in one launch), used by both NCU and NSYS.",
    )
    parser.add_argument(
        "--batch-cuda-device",
        type=int,
        default=1,
        help="CUDA GPU device ID to run batch job.",
    )
    parser.add_argument(
        "--batch-cmd",
        type=str,
        default=None,
        help="Run batch job command.",
    )
    parser.add_argument(
        "--exec-mode",
        type=str,
        default="discrete",
        nargs="?",
        choices=["discrete", "continuous", "continuous_events"],
        help="Set execution mode of the operators (discrete, continuous, continuous_events). Default=discrete",
    )
    parser.add_argument(
        "-p",
        "--profile",
        action="store_true",
        help="Enable profiler and tracing.",
    )
    parser.add_argument(
        "--eg",
        action="store_true",
        help="Collect execution graph.",
    )

    parser.add_argument(
        "-l", "--log-level", default="INFO", help="Log output verbosity."
    )
    parser.add_argument("--version", action="store_true", help="Print version.")

    args = parser.parse_args()

    logger = init_logging(getattr(logging, args.log_level.upper(), logging.INFO))

    if args.version:
        logger.info(f"PARAM train compute version: {__version__}")
        return
    elif not args.config:
        parser.print_usage()
        return

    # Load PyTorch implementations for data generator and operators.
    load_modules(lib_pytorch)

    # Load PyTorch operator workloads.
    load_modules(workloads_pytorch)

    run_options = get_benchmark_options()
    run_options["warmup"] = args.warmup
    run_options["iteration"] = args.iteration
    run_options["device"] = args.device
    run_options["cuda_l2_cache"] = args.cuda_l2_cache == "on"
    run_options["resume_op_run_id"] = args.resume_id
    run_options["stop_op_run_id"] = args.stop_id
    run_options["run_batch_size"] = args.run_batch_size
    run_options["batch_cuda_device"] = args.batch_cuda_device

    if args.backward:
        run_options["pass_type"] = ExecutionPass.BACKWARD
    else:
        run_options["pass_type"] = ExecutionPass.FORWARD

    run_options["op_exec_mode"] = OpExecutionMode(args.exec_mode)
    run_options["run_ncu"] = args.ncu
    run_options["run_nsys"] = args.nsys

    pid = os.getpid()

    start_time = datetime.now()
    timestamp = int(datetime.timestamp(start_time))

    out_file_prefix = f"{args.output_prefix}_{pid}_{timestamp}"
    out_file_name = f"{out_file_prefix}.json"

    write_option = "a" if args.append else "w"

    if args.batch_cmd:
        run_options["batch_cmd"] = args.batch_cmd

    if args.ncu_bin:
        run_options["ncu_bin"] = args.ncu_bin
    if args.ncu_warmup:
        run_options["ncu_warmup"] = args.ncu_warmup
    if args.ncu_iteration:
        run_options["ncu_iteration"] = args.ncu_iteration
    if args.ncu_args_file:
        with open(args.ncu_args_file, "r") as ncu_file:
            run_options["ncu_args"] = ncu_file.read().strip()

    if args.nsys_bin:
        run_options["nsys_bin"] = args.nsys_bin
    if args.nsys_warmup:
        run_options["nsys_warmup"] = args.nsys_warmup
    if args.nsys_iteration:
        run_options["nsys_iteration"] = args.nsys_iteration
    if args.nsys_args_file:
        with open(args.nsys_args_file, "r") as nsys_file:
            run_options["nsys_args"] = nsys_file.read().strip()

    run_options["cmd_args"] = args.__dict__

    with open(out_file_name, write_option) as out_file:
        run_options["out_file_prefix"] = args.output_prefix
        run_options["out_stream"] = out_file
        benchmark_setup = {
            "run_options": run_options,
            "sys_info": get_sys_info(),
            "start_time": start_time.isoformat(timespec="seconds"),
        }
        print(json.dumps(benchmark_setup, default=str), file=out_file)

        bench_config = BenchmarkConfig(run_options)
        bench_config.load_json_file(args.config)
        benchmark = make_default_benchmark(bench_config)
        use_cuda = False
        if run_options["device"].startswith("cuda"):
            use_cuda = True

        eg = None
        if args.eg:
            eg_file = f"{out_file_prefix}_eg.json"
            eg = ExecutionGraphObserver()
            eg.register_callback(eg_file)
            eg.start()

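        # Run the benchmark under the autograd/Kineto profiler; with --profile off,
        # the first positional argument (enabled) is False and profiling is a no-op.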
        with torch.autograd.profiler.profile(
            args.profile, use_cuda=use_cuda, use_kineto=True, record_shapes=False
        ) as prof:
            with record_function(f"[param|{run_options['device']}]"):
                benchmark.run()

        if eg:
            eg.stop()
            eg.unregister_callback()
            logger.info(f"exeution graph: {eg_file}")

        print(
            json.dumps({"finish_time": datetime.now().isoformat(timespec="seconds")}),
            file=out_file,
        )
        if args.profile and prof:
            trace_file = f"{out_file_prefix}_trace.json"
            logger.info(f"trace: {trace_file}")
            prof.export_chrome_trace(trace_file)
            print(json.dumps({"trace_file": trace_file}), file=out_file)

    logger.info(f"benchmark result: {out_file_name}")
Example #5
    def benchTime(self):
        self.preprocess_graph()
        print("Start to execution: ")
        time.sleep(10)
        total_time = 0.0
        event_1 = torch.cuda.Event(enable_timing=True)
        event_2 = torch.cuda.Event(enable_timing=True)

        eg_file = "/tmp/replay_eg.json"
        eg = ExecutionGraphObserver()
        eg.register_callback(eg_file)
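        # When profiling replay, the observer captures the first post-warmup
        # iteration (start/stop toggled inside the loop below).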

        if self.profile_replay:
            with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                record_shapes=True,
                # schedule=torch.profiler.schedule(
                #     skip_first=10,
                #     wait=10,
                #     warmup=10,
                #     active=10,
                # ),
                on_trace_ready=trace_handler,
                # profile_memory=True,
            ) as prof:
                for iter in range(self.numWarmupIters + self.numIters):
                    if iter == self.numWarmupIters:
                        eg.start()
                    if iter == self.numWarmupIters + 1:
                        eg.stop()
                        eg.unregister_callback()
                    event_1.record()
                    for node in self.sorted_nodes:
                        self.run_op(node)
                    event_2.record()
                    torch.cuda.synchronize()
                    if iter >= self.numWarmupIters:
                        total_time += event_1.elapsed_time(event_2)
                    # Comment out this for now since it will introduce additional cudaMalloc
                    # self.reset_registry()
                    prof.step()
                    # print(iter, torch.cuda.memory_allocated(self.cuda))
            # print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=20))
        else:
            for iter in range(self.numWarmupIters + self.numIters):
                event_1.record()
                for node in self.sorted_nodes:
                    self.run_op(node)
                event_2.record()
                torch.cuda.synchronize()
                if iter >= self.numWarmupIters:
                    total_time += event_1.elapsed_time(event_2)
                # Comment out this for now since it will introduce additional cudaMalloc
                # self.reset_registry()

        if self.profile_memory:
            print("Allocated GPU memory(B):")
            for node, mem in sorted(self.op_allocated_mem.items(), key=lambda item: item[1], reverse=True)[:100]:
                print(node.id, mem)
            print("Reserved GPU memory(B):")
            for node, mem in sorted(self.op_reserved_mem.items(), key=lambda item: item[1], reverse=True)[:100]:
                print(node.id, mem)