def train_parallel(args):
    """Build train/test programs and run multi-device training.

    Flow: build programs, prepare distributed training according to
    ``args.update_method`` ("pserver" transpiles the programs and may run
    this process as a parameter server; "nccl2" prepares NCCL2 startup),
    then train for ``args.num_epochs`` passes with ``fluid.ParallelExecutor``,
    printing metrics every 30 batches.  After each pass the model is
    evaluated (once ``pass_id >= args.start_test_pass``) and persistables
    are saved under ``args.model_save_dir/args.model/<pass_id>``.

    Args:
        args: parsed CLI namespace.  Fields read here: use_gpu,
            update_method, dist_env, multi_batch_repeat, checkpoint,
            num_threads, enable_sequential_execution, reduce_strategy,
            total_images, batch_size, num_epochs, skip_unbalanced_data,
            start_test_pass, model_save_dir, model.

    Side effects: runs training on the current device(s), writes model
    checkpoints to disk, prints progress; calls ``exit(0)`` when this
    process is a parameter server.
    """
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    train_pyreader, train_cost, train_acc1, train_acc5 = build_program(
        True, train_prog, startup_prog, args)
    test_pyreader, test_cost, test_acc1, test_acc5 = build_program(
        False, test_prog, startup_prog, args)

    if args.update_method == "pserver":
        train_prog, startup_prog = pserver_prepare(args, train_prog,
                                                   startup_prog)
    elif args.update_method == "nccl2":
        nccl2_prepare(args, startup_prog)

    if args.dist_env["training_role"] == "PSERVER":
        # Parameter-server role never trains: serve and terminate.
        run_pserver(train_prog, startup_prog)
        exit(0)

    if args.use_gpu:
        # NOTE: for multi process mode: one process per GPU device.
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = core.CUDAPlace(gpu_id) if args.use_gpu else core.CPUPlace()
    startup_exe = fluid.Executor(place)

    if args.multi_batch_repeat > 1:
        append_bn_repeat_init_op(train_prog, startup_prog,
                                 args.multi_batch_repeat)
    startup_exe.run(startup_prog)

    if args.checkpoint:
        fluid.io.load_persistables(startup_exe, args.checkpoint,
                                   main_program=train_prog)

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.num_threads
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    build_strategy.enable_sequential_execution = bool(
        args.enable_sequential_execution)

    # FIX: access the ReduceStrategy enum on the class instead of
    # constructing a throwaway BuildStrategy instance just to read it.
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.AllReduce

    if args.update_method in ("pserver", "local"):
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0
    else:
        num_trainers = args.dist_env["num_trainers"]
        trainer_id = args.dist_env["trainer_id"]
        # Set this to let build_strategy to add "allreduce_deps_pass"
        # automatically
        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id

    if args.multi_batch_repeat > 1:
        # Insert the batch-merge pass near the end of the pass pipeline;
        # the -4 offset places it before the final optimization passes.
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(
            len(pass_builder.all_passes()) - 4, "multi_batch_merge_pass")
        mypass.set("num_repeats", args.multi_batch_repeat)

    exe = fluid.ParallelExecutor(
        True,
        train_cost.name,
        main_program=train_prog,
        exec_strategy=strategy,
        build_strategy=build_strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

    # Uncomment below lines to use ParallelExecutor to run test.
    # test_exe = fluid.ParallelExecutor(
    #     True,
    #     main_program=test_prog,
    #     share_vars_from=exe,
    #     scope=fluid.global_scope().new_scope()
    # )

    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
    steps_per_pass = args.total_images / args.batch_size / args.dist_env[
        "num_trainers"]
    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
        batch_id = 1
        # use pass_id+1 as per pass global shuffle for distributed training
        prepare_reader(True, train_pyreader, args, pass_id + 1)
        train_pyreader.start()
        while True:
            try:
                # Fetch (and thus synchronize on) metrics only every 30
                # batches; other iterations run fetch-free for speed.
                if batch_id % 30 == 0:
                    fetch_ret = exe.run(fetch_list)
                    fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                    print(
                        "Pass [%d/%d], batch [%d/%d], loss %s, acc1: %s, acc5: %s, avg batch time %.4f"
                        % (pass_id, args.num_epochs, batch_id, steps_per_pass,
                           fetched_data[0], fetched_data[1], fetched_data[2],
                           (time.time() - start_time) / batch_id))
                else:
                    fetch_ret = exe.run([])
            except fluid.core.EOFException:
                break
            except fluid.core.EnforceNotMet:
                traceback.print_exc()
                break
            num_samples += args.batch_size
            batch_id += 1
            if args.skip_unbalanced_data and batch_id >= steps_per_pass:
                break
        print_train_time(start_time, time.time(), num_samples)
        train_pyreader.reset()

        if pass_id >= args.start_test_pass:
            if args.multi_batch_repeat > 1:
                copyback_repeat_bn_params(train_prog)
            test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
            test_ret = test_single(startup_exe, test_prog, args, test_pyreader,
                                   test_fetch_list)
            # NOTE: switch to below line if you use ParallelExecutor to run test.
            # test_ret = test_parallel(test_exe, test_prog, args, test_pyreader, test_fetch_list)
            print("Pass: %d, Test Loss %s, test acc1: %s, test acc5: %s\n" %
                  (pass_id, test_ret[0], test_ret[1], test_ret[2]))

        # FIX: build the path with os.path.join instead of manual '/'
        # concatenation (equivalent result, platform-correct).
        model_path = os.path.join(args.model_save_dir, args.model,
                                  str(pass_id))
        print("saving model to ", model_path)
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(startup_exe, model_path,
                                   main_program=train_prog)
    startup_exe.close()
    print("total train time: ", time.time() - over_all_start)
def train_parallel(args):
    """Run multi-device training with ``fluid.ParallelExecutor``.

    NOTE(review): this redefines the ``train_parallel`` earlier in the
    file (this later definition wins at import time) and looks like an
    earlier revision — no checkpoint restore and no model saving here;
    confirm which version is intended to be kept.

    Flow: build train/test programs, prepare distributed training per
    ``args.update_method``, then train for ``args.num_epochs`` passes,
    printing metrics every 30 batches and evaluating after each pass
    once ``pass_id > args.start_test_pass``.
    """
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    train_pyreader, train_cost, train_acc1, train_acc5 = build_program(
        True, train_prog, startup_prog, args)
    test_pyreader, test_cost, test_acc1, test_acc5 = build_program(
        False, test_prog, startup_prog, args)

    if args.update_method == "pserver":
        train_prog, startup_prog = pserver_prepare(args, train_prog,
                                                   startup_prog)
    elif args.update_method == "nccl2":
        nccl2_prepare(args, startup_prog)

    if args.dist_env["training_role"] == "PSERVER":
        # This process only serves parameters; it never trains.
        run_pserver(train_prog, startup_prog)
        exit(0)

    if args.use_gpu:
        # NOTE: for multi process mode: one process per GPU device.
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = core.CUDAPlace(gpu_id) if args.use_gpu else core.CPUPlace()
    startup_exe = fluid.Executor(place)

    if args.multi_batch_repeat > 1:
        append_bn_repeat_init_op(train_prog, startup_prog,
                                 args.multi_batch_repeat)
    startup_exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.num_threads
    build_strategy = fluid.BuildStrategy()

    if args.multi_batch_repeat > 1:
        # Splice the batch-merge pass into the finalized pass pipeline.
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        merge_pass = pass_builder.insert_pass(
            len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
        merge_pass.set_int("num_repeats", args.multi_batch_repeat)

    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce

    if args.update_method in ("pserver", "local"):
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0
    else:
        num_trainers = args.dist_env["num_trainers"]
        trainer_id = args.dist_env["trainer_id"]

    exe = fluid.ParallelExecutor(
        True,
        train_cost.name,
        main_program=train_prog,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
    steps_per_pass = args.total_images / args.batch_size / args.dist_env[
        "num_trainers"]

    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
        batch_id = 1
        # use pass_id+1 as per pass global shuffle for distributed training
        prepare_reader(True, train_pyreader, args, pass_id + 1)
        train_pyreader.start()
        while True:
            try:
                # Only every 30th batch fetches metrics; the rest run
                # without fetching to avoid synchronization overhead.
                want_stats = (batch_id % 30 == 0)
                if want_stats:
                    results = exe.run(fetch_list)
                    metrics = [np.mean(np.array(r)) for r in results]
                    elapsed = time.time() - start_time
                    print(
                        "Pass %d, batch %d, loss %s, acc1: %s, acc5: %s, avg batch time %.4f"
                        % (pass_id, batch_id, metrics[0], metrics[1],
                           metrics[2], elapsed / batch_id))
                else:
                    results = exe.run([])
            except fluid.core.EOFException:
                break
            except fluid.core.EnforceNotMet:
                traceback.print_exc()
                break
            num_samples += args.batch_size
            batch_id += 1
            if args.skip_unbalanced_data and batch_id >= steps_per_pass:
                break
        print_train_time(start_time, time.time(), num_samples)
        train_pyreader.reset()

        if pass_id > args.start_test_pass:
            if args.multi_batch_repeat > 1:
                copyback_repeat_bn_params(train_prog)
            test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
            test_ret = test_single(startup_exe, test_prog, args, test_pyreader,
                                   test_fetch_list)
            print("Pass: %d, Test Loss %s, test acc1: %s, test acc5: %s\n" %
                  (pass_id, test_ret[0], test_ret[1], test_ret[2]))

    startup_exe.close()
    print("total train time: ", time.time() - over_all_start)