def main():
    """Parse the command-line arguments and run training.

    The parsed arguments are echoed with ``print_arguments``. When
    ``args.profile`` is set, ``train(args)`` runs inside a profiler context:
    ``profiler.cuda_profiler`` if ``args.use_gpu`` is truthy, otherwise
    ``profiler.profiler`` for the CPU. Without ``args.profile`` training runs
    with no profiler attached.
    """
    args = parser.parse_args()
    print_arguments(args)

    # Fast path: no profiling requested.
    if not args.profile:
        train(args)
        return

    # Pick the profiler context first, then run training inside it.
    if args.use_gpu:
        profiling = profiler.cuda_profiler("cuda_profiler.txt", 'csv')
    else:
        profiling = profiler.profiler("CPU", sorted_key='total')
    with profiling:
        train(args)
def main():
    """Build the selected model and run the benchmark training.

    Steps, in order:

    1. Parse and echo the command-line arguments.
    2. Import ``models.<args.model>`` and collect the values returned by its
       ``get_model(args)`` into the positional argument list shared by
       ``train`` and ``train_parallel`` (``args`` itself is appended).
    3. Call ``minimize`` on element 2 of that list with element 0 as the
       argument (optimizer and average loss, per the inline comment).
    4. Dispatch on ``args.update_method``:

       * ``"pserver"``: transpile the program, run either ``train_parallel``
         (multi-GPU trainer) or ``train``, then exit the process with
         status 0.
       * anything else: use the default main/startup programs; for
         ``"nccl2"`` the NCCL settings come from ``append_nccl2_prepare``.
         Single-GPU runs use ``train`` (optionally under the CUDA profiler),
         multi-GPU runs use ``train_parallel``.

    Raises:
        Exception: if the pserver transpile yields no trainer program, or if
            a multi-GPU run is requested with ``args.device == "CPU"``.
    """
    args = parse_args()
    print_arguments(args)

    # the unique trainer id, starting from 0, needed by trainer
    # only
    nccl_id_var, num_trainers, trainer_id = (
        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))

    if args.use_cprof:
        # NOTE(review): the profiler is enabled here but this function never
        # disables it or reports its stats -- confirm whether that is handled
        # elsewhere.
        pr = cProfile.Profile()
        pr.enable()

    model_def = __import__("models.%s" % args.model, fromlist=["models"])
    train_args = list(model_def.get_model(args))
    train_args.append(args)
    # Run optimizer.minimize(avg_loss)
    train_args[2].minimize(train_args[0])
    if args.memory_optimize:
        fluid.memory_optimize(fluid.default_main_program())

    if args.update_method == "pserver":
        train_prog, startup_prog = dist_transpile(trainer_id)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")
        train_args.extend([train_prog, startup_prog])
        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
            train_args.extend([nccl_id_var, num_trainers, trainer_id])
            train_parallel(*train_args)
        else:
            # train() and train_parallel() are alternatives: previously
            # train() also ran after train_parallel(), receiving the argument
            # list already extended with the three NCCL values.
            train(*train_args)
        exit(0)

    # for other update methods, use default programs
    train_args.append(fluid.default_main_program())
    train_args.append(fluid.default_startup_program())

    if args.update_method == "nccl2":
        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)

    if args.gpus == 1:
        # NOTE: parallel executor uses the profiler internally
        if args.use_nvprof and args.device == 'GPU':
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv'):
                train(*train_args)
        else:
            train(*train_args)
    else:
        if args.device == "CPU":
            raise Exception("Only support GPU perf with parallel exe")
        train_args.extend([nccl_id_var, num_trainers, trainer_id])
        train_parallel(*train_args)
def test_nvprof(self):
    """Run a small conv2d program under ``profiler.cuda_profiler``.

    Returns early when ``fluid.core.is_compiled_with_cuda()`` is false.
    Otherwise the default programs are executed on ``fluid.CUDAPlace(0)``:
    the startup program once, then the main program 8 times inside the
    profiler context, each time fed a fresh random float32 array of shape
    [4, 3, 28, 28]. The profiler output file is removed afterwards.
    """
    if not fluid.core.is_compiled_with_cuda():
        return

    num_iters = 8
    batch_shape = [4, 3, 28, 28]
    output_file = 'cuda_profiler.txt'

    image = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
    # The returned variable is not used afterwards; presumably the call is
    # needed to add the conv op to the default main program -- confirm.
    layers.conv2d(image, 20, 3, stride=[1, 1], padding=[1, 1])

    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(fluid.default_startup_program())

    with profiler.cuda_profiler(output_file, 'csv'):
        for _ in range(num_iters):
            batch = np.random.random(batch_shape).astype('float32')
            exe.run(fluid.default_main_program(), feed={'data': batch})
    os.remove(output_file)
# NOTE(review): everything down to the last print() is the tail of a function
# whose header lies outside this view (presumably the `infer` function called
# below -- confirm); it is kept at function-body indentation.
    # Latency statistics skip the first `args.skip_pass_num` entries of
    # batch_times (presumably warm-up passes -- confirm against the loop that
    # fills batch_times).
    latencies = batch_times[args.skip_pass_num:]
    latency_avg = np.average(latencies)
    latency_std = np.std(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    # Throughput statistics use every entry of wpses; the 1st percentile is
    # what gets reported below as "wps for 99pc latency".
    wps_avg = np.average(wpses)
    wps_std = np.std(wpses)
    wps_pc01 = np.percentile(wpses, 1)

    # Benchmark output
    print('\nTotal passes (incl. warm-up): %d' % (total_passes))
    print('Total iterations (incl. warm-up): %d' % (all_iters))
    print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
    print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
          (latency_avg, latency_std, latency_pc99))
    print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
          (wps_avg, wps_std, wps_pc01))


if __name__ == "__main__":
    # Script entry point: parse and echo the arguments, then run inference.
    args = parse_args()
    print_arguments(args)
    if args.profile:
        # With --profile, infer() runs inside a profiler context chosen by
        # args.device: the CUDA profiler for 'GPU', the CPU profiler otherwise.
        if args.device == 'GPU':
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                infer(args)
        else:
            with profiler.profiler('CPU', sorted_key='total') as cpuprof:
                infer(args)
    else:
        # No profiling requested: run inference directly.
        infer(args)