def linear_test(self, Module, profiler_output_path=""):
    D_in = 10
    H = 5
    D_out = 15
    B = 8
    NUM_INPUTS = 2

    module = Module(D_in, H, D_out)

    inputs = []
    for i in range(NUM_INPUTS):
        inputs.append([torch.randn(B, D_in), torch.randn(B, D_in)])

    bench = ThroughputBenchmark(module)

    for input in inputs:
        # can do both args and kwargs here
        bench.add_input(input[0], x2=input[1])

    for i in range(NUM_INPUTS):
        # or just unpack the list of inputs
        module_result = module(*inputs[i])
        bench_result = bench.run_once(*inputs[i])
        torch.testing.assert_close(bench_result, module_result)

    stats = bench.benchmark(
        num_calling_threads=4,
        num_warmup_iters=100,
        num_iters=1000,
        profiler_output_path=profiler_output_path,
    )

    print(stats)
def linear_test(self, Module):
    D_in = 10
    H = 5
    D_out = 15
    B = 8
    NUM_INPUTS = 2

    module = Module(D_in, H, D_out)

    inputs = []
    for i in range(NUM_INPUTS):
        inputs.append([torch.randn(B, D_in), torch.randn(B, D_in)])

    bench = ThroughputBenchmark(module)

    for input in inputs:
        # can do both args and kwargs here
        bench.add_input(input[0], x2=input[1])

    for i in range(NUM_INPUTS):
        # or just unpack the list of inputs
        module_result = module(*inputs[i])
        bench_result = bench.run_once(*inputs[i])
        assert_allclose(bench_result, module_result)

    stats = bench.benchmark(
        num_calling_threads=4,
        num_warmup_iters=100,
        num_iters=1000,
    )

    print("Avg latency (ms): {}".format(stats.latency_avg_ms))
    print("Number of iterations: {}".format(stats.num_iters))
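For reference, here is the same add_input / run_once / benchmark pattern as a self-contained script. SimpleLinear is an illustrative stand-in for the Module factory the tests receive, not the module the test suite actually uses; the ThroughputBenchmark calls themselves match the API exercised above.

import torch
import torch.nn as nn
from torch.utils.throughput_benchmark import ThroughputBenchmark

class SimpleLinear(nn.Module):
    def __init__(self, d_in, h, d_out):
        super().__init__()
        self.fc1 = nn.Linear(d_in, h)
        self.fc2 = nn.Linear(h, d_out)

    def forward(self, x1, x2):
        # two inputs, so the x2 kwarg form of add_input above applies
        return self.fc2(torch.relu(self.fc1(x1 + x2)))

module = SimpleLinear(10, 5, 15)
bench = ThroughputBenchmark(module)
bench.add_input(torch.randn(8, 10), x2=torch.randn(8, 10))

# single-threaded sanity check against a direct module call
x1, x2 = torch.randn(8, 10), torch.randn(8, 10)
torch.testing.assert_close(bench.run_once(x1, x2=x2), module(x1, x2))

stats = bench.benchmark(num_calling_threads=4, num_warmup_iters=100, num_iters=1000)
print(stats.latency_avg_ms, stats.iters_per_second)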
def run_throughput_benchmark(args, dlrm, test_ld):
    bench = ThroughputBenchmark(dlrm)
    for j, inputBatch in enumerate(test_ld):
        X, lS_o, lS_i, T, W, CBPP = unpack_batch(inputBatch)
        bench.add_input(X, lS_o, lS_i)
        if j == 1000:
            break
    stats = bench.benchmark(
        num_calling_threads=args.share_weight_instance,
        num_warmup_iters=100,
        num_iters=args.num_batches * args.share_weight_instance,
    )
    print(stats)
    latency = stats.latency_avg_ms
    # examples/sec across all instances: (1000 / latency_ms) iters/sec * batch size * instances
    throughput = (1 / latency) * 1000 * args.mini_batch_size * args.share_weight_instance
    print("Throughput: {:.3f} fps".format(throughput))
    sys.exit()
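The throughput line converts the average per-iteration latency (in milliseconds) into examples per second across all weight-sharing instances. A worked example with illustrative numbers, not taken from the source:

# latency_avg_ms = 2.0 ms per iteration
# iterations per second: (1 / 2.0) * 1000 = 500
# examples per second:   500 * mini_batch_size (128) * share_weight_instance (4)
#                      = 256,000 fps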
print("time/loss/accuracy (if enabled):") with torch.autograd.profiler.profile(args.enable_profiling, use_gpu) as prof: if args.share_weight: data = train_loader.sampler.data_source for j, (X, lS_o, lS_i, T) in enumerate(train_loader): traced_model = torch.jit.trace(dlrm, (X, lS_o, lS_i), check_trace=False) break bench = ThroughputBenchmark(traced_model) j = 0 for j, (X, lS_o, lS_i, T) in enumerate(train_loader): bench.add_input(X, lS_o, lS_i) stats = bench.benchmark( num_calling_threads=args.num_instance, num_warmup_iters=100, num_iters=900 * args.num_instance, ) print(stats) else: while k < args.nepochs: accum_time_begin = time_wrap(use_gpu) for j, (X, lS_o, lS_i, T) in enumerate(train_loader): # discard calibration data if args.do_int8_inference and j < nbatches * 0.05: continue # early exit if nbatches was set by the user and has been exceeded if j >= nbatches: break ''' # debug prints
def benchmark_using_throughput_benchmark(config, module):
    # module is a wrapper carrying the nn.Module under test and its example inputs
    print("Benchmarking via ThroughputBenchmark")
    bench = ThroughputBenchmark(module.module)
    bench.add_input(*module.tensor_inputs)
    stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters)
    # normalize: each forward call runs NUM_LOOP_ITERS inner loop iterations
    return stats.latency_avg_ms / NUM_LOOP_ITERS
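A hypothetical driver for this helper, since only the callee side is shown. The SimpleNamespace config, the wrapper object, and the NUM_LOOP_ITERS value are illustrative stand-ins for whatever the surrounding benchmark script defines; the division above suggests the wrapped module runs NUM_LOOP_ITERS inner iterations per forward call.

from types import SimpleNamespace
import torch
import torch.nn as nn

NUM_LOOP_ITERS = 1  # placeholder; the real value is set by the benchmark script

wrapped = SimpleNamespace(
    module=nn.Linear(10, 10),             # module under test
    tensor_inputs=(torch.randn(8, 10),),  # example inputs, unpacked into add_input
)
config = SimpleNamespace(num_warmup_iters=100, num_iters=1000)

per_loop_ms = benchmark_using_throughput_benchmark(config, wrapped)
print(f"per-loop latency: {per_loop_ms:.4f} ms")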