def distribute_train(args):
    """Run this process's share of an asynchronous parameter-server CTR job.

    The node role is resolved by ``PaddleCloudRoleMaker``.  A server node
    initialises and runs the parameter server.  A worker node trains for
    ``args.epochs`` epochs with ``train_from_dataset``; when
    ``args.save_model`` is truthy the first worker saves persistables after
    every epoch under ``<args.model_path>/epoch_<n>``.

    Args:
        args: options object.  This function reads ``learning_rate``,
            ``epochs``, ``save_model`` and ``model_path`` and forwards the
            whole object to ``CTR.input_data``, ``CTR.net`` and
            ``get_dataset``.
    """
    # Initialise this node through the fleet API with the resolved role.
    fleet.init(role_maker.PaddleCloudRoleMaker())

    # Distributed run mode: asynchronous, with runtime split send/recv
    # enabled.
    dist_config = DistributeTranspilerConfig()
    dist_config.sync_mode = False
    dist_config.runtime_split_send_recv = True

    model = CTR()
    inputs = model.input_data(args)
    avg_cost, auc_var = model.net(inputs, args)

    # Wrap the optimizer with the distributed strategy and build the program.
    base_optimizer = fluid.optimizer.Adam(args.learning_rate)
    dist_optimizer = fleet.distributed_optimizer(base_optimizer, dist_config)
    dist_optimizer.minimize(avg_cost)

    # Branch on the role of this node.
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()
        executor = fluid.Executor(fluid.CPUPlace())
        # fleet.startup_program is the startup program carrying the
        # distributed logic.
        executor.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)

        for epoch in range(args.epochs):
            # Shuffle at file granularity.
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, the program produced for
            # distributed training.
            began = time.time()
            executor.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_list=[auc_var],
                fetch_info=["Epoch {} auc ".format(epoch)],
                print_period=100,
                debug=False)
            finished = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), finished - began))

            # Only the first worker writes the model.
            if args.save_model and fleet.is_first_worker():
                epoch_dir = os.path.join(
                    str(args.model_path), "epoch_" + str(epoch))
                fleet.save_persistables(executor=executor, dirname=epoch_dir)

        fleet.stop_worker()
        logger.info("Distribute Train Success!")
def run_pserver(self, args):
    """Initialise fleet with a user-defined SERVER role and run the server.

    Args:
        args: options object.  Reads ``role``, ``current_id``, ``trainers``,
            ``endpoints`` (comma-separated string), ``sync_mode``,
            ``geo_sgd_mode`` and ``geo_sgd_need_push_nums``.

    Raises:
        ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
    """
    is_pserver = args.role.upper() == "PSERVER"
    if not is_pserver:
        raise ValueError("args role must be PSERVER")

    server_role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","),
    )
    fleet.init(server_role)

    # Copy the distributed-mode switches from the command-line options.
    config = DistributeTranspilerConfig()
    config.sync_mode = args.sync_mode
    config.geo_sgd_mode = args.geo_sgd_mode
    config.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

    # The network must be built and minimised before the server starts.
    loss = self.net()
    sgd = fluid.optimizer.SGD(LEARNING_RATE)
    fleet.distributed_optimizer(sgd, config).minimize(loss)

    fleet.init_server()
    fleet.run_server()
def main(args):
    """Build the GraphSAGE model and run this node as fleet server or worker.

    Args:
        args: options object.  Reads ``lr``, ``optimizer`` and
            ``warm_start_from_dir`` and is forwarded to ``GraphsageModel``,
            ``reader.get_iter`` and ``train_prog``.
    """
    log.info("start")

    # Both values come from the environment; worker_num is parsed but not
    # used further in this function.
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = GraphsageModel(args)
    loss = model.forward()
    train_iter = reader.get_iter(args, model.graph_wrapper, 'train')
    pyreader = fake_py_reader(train_iter, num_devices)

    # Initialise fleet, then attach the optimizer.
    init_role()
    optimization(args.lr, loss, args.optimizer)

    # Server side.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    # Everything below only applies to workers.
    if not fleet.is_worker():
        return

    log.info("start init worker done")
    fleet.init_worker()
    log.info("init worker done")

    executor = F.Executor(F.CPUPlace())
    executor.run(fleet.startup_program)
    log.info("Startup done")

    compiled = build_complied_prog(fleet.main_program, loss)
    train_prog(executor, compiled, model, pyreader, args)
def run_pserver(self, args):
    """Initialise fleet from ``args`` and run this process as a server.

    The role, strategy, network and optimizer are produced by this object's
    ``build_role``, ``build_strategy``, ``net`` and ``build_optimizer``.

    Args:
        args: options object forwarded to the builder methods.
    """
    role = self.build_role(args)
    fleet.init(role)

    dist_strategy = self.build_strategy(args)
    loss = self.net(args)
    self.build_optimizer(loss, dist_strategy)

    fleet.init_server()
    fleet.run_server()
def init_and_run_ps_worker(self, ckpt_path):
    """Start this process as a parameter server or initialise it as a worker.

    A CPU executor is always created and stored on ``self.exe``.  On a
    server node the server is initialised, warm-started from ``ckpt_path``
    and run; the process then exits.  On a worker node the worker is
    initialised and the fleet startup program is executed.

    Args:
        ckpt_path: checkpoint path handed to ``self.warmstart`` on servers.

    Raises:
        SystemExit: on a server node, once ``run_server`` returns.
    """
    self.exe = F.Executor(F.CPUPlace())

    if tfleet.is_server():
        tfleet.init_server()
        self.warmstart(tfleet.startup_program, path=ckpt_path)
        tfleet.run_server()
        # Raise SystemExit directly rather than calling the interactive
        # ``exit()`` helper, which is injected by the ``site`` module and is
        # not guaranteed to exist (e.g. under ``python -S``).
        raise SystemExit

    if tfleet.is_worker():
        log.info("start init worker done")
        tfleet.init_worker()
        self.exe.run(tfleet.startup_program)
def run_pserver(self, args):
    """Run this process as a fleet parameter server.

    Args:
        args: options object.  Reads ``role``, ``current_id``, ``trainers``,
            ``endpoints`` (comma-separated string) and the ``run_params``
            mapping with keys ``sync_mode``, ``async_mode``,
            ``slice_var_up``, ``enable_dc_asgd``, ``split_method``,
            ``wait_port``, ``runtime_split_send_recv``,
            ``use_hierarchical_allreduce``, ``geo_sgd`` and ``push_nums``.
            ``args`` is also forwarded to ``self.net``.

    Raises:
        ValueError: if ``args.role`` is not "PSERVER" (case-insensitive).
    """
    # Validate first so a misconfigured role fails before the heavy imports
    # and before the global program seeds are touched.
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")

    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    import paddle.fluid as fluid
    from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
    from paddle.fluid.transpiler.ps_dispatcher import HashName

    # Fixed seeds on both default programs.
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1

    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)

    run_params = args.run_params
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = run_params["sync_mode"]
    strategy.async_mode = run_params["async_mode"]
    strategy.mode = "pserver"
    strategy.slice_var_up = run_params['slice_var_up']
    strategy.enable_dc_asgd = run_params['enable_dc_asgd']
    # Previously HashName was assigned and then unconditionally overwritten
    # by RoundRobin, so the option had no effect.  Make it a real choice.
    if run_params['split_method']:
        strategy.split_method = HashName
    else:
        strategy.split_method = RoundRobin
    strategy.wait_port = run_params['wait_port']
    strategy.runtime_split_send_recv = run_params['runtime_split_send_recv']
    strategy.use_hierarchical_allreduce = run_params[
        'use_hierarchical_allreduce']
    strategy.geo_sgd_mode = run_params['geo_sgd']
    strategy.geo_sgd_need_push_nums = run_params['push_nums']

    avg_cost = self.net(args)
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_server()
    fleet.run_server()
def train(args):
    """Train DeepWalk as a fleet parameter-server job.

    Every node builds the model and the optimizer.  Server nodes then run
    the parameter server; worker nodes load a graph, bind the sample
    generator to the model's pyreader and run ``train_prog``.

    Args:
        args: options object.  Reads ``num_nodes``, ``hidden_size``,
            ``neg_num``, ``is_sparse``, ``is_distributed``, ``epoch``,
            ``batch_size``, ``optimizer``, ``lr`` (rescaled in place when
            the optimizer is "sgd"), ``walk_len``, ``win_size``,
            ``warm_start_from_dir``, ``dataset``, ``walkpath_files`` and
            ``edge_path``.

    Raises:
        ValueError: if ``args.dataset`` is set to an unknown dataset name.
    """
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    # PADDLE_TRAINERS_NUM defaults to "0"; it is used as a divisor below, so
    # treat a missing/zero value as a single worker instead of crashing with
    # ZeroDivisionError.
    worker_num = max(int(os.getenv("PADDLE_TRAINERS_NUM", "0")), 1)
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    init_role()

    train_steps = math.ceil(1. * args.num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        # Only workers load samples.
        log.info("init worker done")

        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        if args.dataset is not None:
            if args.dataset == "BlogCatalog":
                graph = data_loader.BlogCatalogDataset().graph
            elif args.dataset == "ArXiv":
                graph = data_loader.ArXivDataset().graph
            else:
                raise ValueError(args.dataset + " dataset doesn't exists")
            # Report the dataset that was actually loaded (the message used
            # to say "BlogCatalog" for ArXiv as well).
            log.info("Load buildin %s dataset done." % args.dataset)
        elif args.walkpath_files is None or args.walkpath_files == "None":
            graph = build_graph(args.num_nodes, args.edge_path)
            log.info("Load graph from '%s' done." % args.edge_path)
        else:
            graph = build_fake_graph(args.num_nodes)
            log.info("Load fake graph done.")

        # Bind the sample generator to the reader and start feeding.
        gen_func = build_gen_func(args, graph)
        pyreader.decorate_tensor_provider(gen_func)
        pyreader.start()

        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
def server(self, context):
    """Initialise and run the fleet server, then flag ``context`` for exit.

    Args:
        context: mutable mapping; ``context['is_exit']`` is set to ``True``
            after ``fleet.run_server()`` returns.
    """
    fleet.init_server()
    fleet.run_server()
    # Mark the context as finished once run_server() has returned.
    context["is_exit"] = True
def fit():
    """Train and evaluate an image classifier as one node of a fleet job.

    The node role comes from the module-level ``roles`` value (1 means
    worker, anything else means server) and ``current_id``.  Servers run the
    parameter server.  Workers train for ``EPOCH_NUM`` passes, evaluate on
    the test reader after every pass, save the model and stop the worker.
    """
    node_role = (role_maker.Role.WORKER
                 if int(roles) == 1 else role_maker.Role.SERVER)
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=node_role,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011"])
    fleet.init(role)

    BATCH_SIZE = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")

    # Shuffled, batched readers for training and testing.
    shuffled_train = paddle.reader.shuffle(
        reader=dataReader(in_file_path + ".data/trainer.list"),
        buf_size=BATCH_SIZE * 100)
    train_reader = paddle.batch(reader=shuffled_train, batch_size=BATCH_SIZE)
    shuffled_test = paddle.reader.shuffle(
        reader=dataReader(in_file_path + ".data/test.list"),
        buf_size=BATCH_SIZE * 100)
    test_reader = paddle.batch(reader=shuffled_test, batch_size=BATCH_SIZE)

    data_shape = [3, 32, 32]
    images = fluid.layers.data(name='images', shape=data_shape,
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Classifier network.
    predict = networkConfiguration(images, type_size)

    # Loss (mean cross entropy) and accuracy.
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)

    # Clone the test program before the optimizer is attached.
    test_program = fluid.default_main_program().clone(for_test=True)

    # Synchronous distributed Adam.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = True
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.Adam(learning_rate=0.001), strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        print("启动worker")
        fleet.init_worker()
        print(fleet.worker_endpoints())

        # ---- training and evaluation ----
        use_cuda = False  # False selects the CPU place below
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        print("cpu")
        # Maps reader batches onto the input variables.
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        print("数据映射")
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        for pass_id in range(EPOCH_NUM):
            print(pass_id)

            # Training pass.
            for batch_id, data in enumerate(train_reader()):
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, acc])
                # Report every 20 batches.
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                          (pass_id, batch_id, train_cost[0], train_acc[0]))

            # Evaluation pass: collect per-batch loss and accuracy.
            test_costs = []
            test_accs = []
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(
                    program=test_program,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, acc])
                test_costs.append(test_cost[0])
                test_accs.append(test_acc[0])
            # Average over the test batches.
            test_cost = (sum(test_costs) / len(test_costs))
            test_acc = (sum(test_accs) / len(test_accs))
            print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
                  (pass_id, test_cost, test_acc))

        # NOTE(review): assumed to run once after all passes — confirm it
        # was not meant to run at the end of every pass.
        save(predict, model_file_path, exe)
        fleet.stop_worker()
def train(use_cuda, save_dirname, is_local, is_increment):
    """
    Train the model locally or as an asynchronous fleet parameter-server job.

    Args:
        use_cuda: run the executor on ``CUDAPlace(0)`` instead of the CPU.
        save_dirname: directory for the inference model and persistables;
            ``None`` disables the periodic save/infer step.
        is_local: ``True`` for single-machine data-parallel training,
            ``False`` for the fleet (server/worker) path.
        is_increment: in the local path, load parameters before training and
            zero the AUC state variables.
    """
    import time

    # NOTE(review): no checkpoint path is wired in here, so the
    # ``is_increment`` branch below passes None to load_params — confirm
    # where the previous model is supposed to come from.
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        # 16 reader instances over the training directory, combined by
        # multiprocess_reader.
        readers = [data_reader(cluster_train_dir) for _ in range(16)]
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers,
            batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))

        exe.run(fluid.default_startup_program())
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # Reduce strategy (the original author notes it is faster on CPU).
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True

        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:
            # Load a previous model to fine-tune, then reset AUC states.
            fluid.io.load_params(exe, old_model, main_program)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                r_time = time.time() - s_time   # time spent reading
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time  # time spent training
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
            # NOTE(review): measured from before the first pass, so this is
            # cumulative time — confirm that is intended.
            pass_time = time.time() - pass_s_time
            print("Pass train time: %f" % pass_time)
    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialise the fleet environment.
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()

        # Worker side.
        if fleet.is_worker():
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    # Time the run from its own start timestamp; the old
                    # code subtracted the read *duration* from the clock.
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog,
                        feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
            fleet.stop_worker()
def train(args):
    """run train

    Builds the TDM network and runs this node as fleet server or worker.
    Servers load initial parameters from ``args.init_model_files_path`` and
    run the parameter server.  Workers set the tree-structure variables
    locally, train for ``args.epoch_num`` epochs, and the first worker saves
    persistables after every epoch.

    Args:
        args: options object.  Reads ``random_seed``, ``sync_mode`` ("sync",
            "half_async" or "async"), ``learning_rate``,
            ``init_model_files_path``, ``epoch_num`` and
            ``model_files_path``; it is also forwarded to ``TdmTrainNet``,
            ``tdm_sampler_prepare``, ``tdm_child_prepare`` and
            ``get_dataset``.

    Raises:
        ValueError: if ``args.sync_mode`` is not one of the supported modes.
    """
    # set random
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # Resolve this node's role and initialise it through the fleet API.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Pick the distributed run mode.  An unknown mode used to leave
    # ``strategy`` unbound and fail later with UnboundLocalError; reject it
    # here with a clear message instead.
    if args.sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif args.sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    elif args.sync_mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    else:
        raise ValueError(
            "unsupported sync_mode {!r}; expected 'sync', 'half_async' "
            "or 'async'".format(args.sync_mode))

    # set model
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()

    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")

    # Wrap the optimizer with the chosen strategy and build the program.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=args.learning_rate, lazy_mode=True)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Branch on the role of this node.
    if fleet.is_server():
        logger.info("TDM Run server ...")
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        # Every variable except the TDM tree-structure ones is expected to
        # be initialised here.
        fleet.init_server(args.init_model_files_path)
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(
                args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")

        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run the startup program that carries the distributed logic.
        exe.run(fleet.startup_program)

        # Set Learning Rate
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'), place)
            logger.info("TDM Set learning rate {}".format(
                args.learning_rate))

        # Set TDM Variable
        logger.info("TDM Begin load parameter.")
        # Tree-structure variables are not updated by the network and are
        # not stored on the parameter server, so they are set locally.  The
        # embedding lives on the parameter server and is not set here.
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        tree_arrays = {
            'TDM_Tree_Travel': tdm_param_prepare_dict['travel_array'],
            'TDM_Tree_Layer': tdm_param_prepare_dict['layer_array'],
            'TDM_Tree_Info': tdm_param_prepare_dict['info_array'],
        }
        for param_name, array in tree_arrays.items():
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(array.astype('int32'), place)
        logger.info("TDM Run Startup End")

        # Train loop
        dataset, file_list, example_num = get_dataset(inputs, args)
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # local shuffle
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, the program produced for
            # distributed training.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # Only the first worker saves the model.
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                # No upload happens here; only the log line is emitted.
                logger.info("Begin upload files")

        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
def train(use_cuda, train_sample_dir, test_sample_dir, old_model,
          output_model, is_local, is_increment):
    """
    Train the three-output model locally or as an asynchronous fleet job.

    Args:
        use_cuda: run on ``CUDAPlace(0)`` instead of the CPU.
        train_sample_dir: training data directory (distributed path only;
            the local path reads from ``streaming_data_reader()``).
        test_sample_dir: data directory handed to ``infer`` after a save.
        old_model: directory of a previous model, used when ``is_increment``
            is true.
        output_model: directory for the saved inference model and
            persistables; ``None`` disables the periodic save/infer step.
        is_local: ``True`` for single-machine training with
            ``ParallelExecutor``, ``False`` for the fleet path.
        is_increment: continue from ``old_model`` instead of from scratch.
    """
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    fetch_targets = [
        navi_predict, voice_navi_predict, speed_navi_predict, avg_cost
    ]

    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(paddle.batch(
            paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
            batch_size=BATCH_SIZE),
            multi_devices=False, drop_last=True)

        exe.run(fluid.default_startup_program())
        main_program = fluid.default_main_program()

        if is_increment:
            # Load a previous model to fine-tune.
            fluid.io.load_params(exe, old_model, main_program)

        # NOTE(review): these two strategies are configured but never passed
        # to ParallelExecutor below — confirm whether they should be.
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        main_program.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True

        # Parallel executor for training.
        train_pe = fluid.ParallelExecutor(
            use_cuda=use_cuda,
            main_program=main_program,
            loss_name=avg_cost.name)

        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                if batch_id % 100 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order, fetch_targets, exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)
    else:
        # Initialise the fleet environment, then attach the distributed
        # strategy to the optimizer.
        fleet.init(role)
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()

        # Worker side.
        if fleet.is_worker():
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(paddle.batch(
                paddle.reader.shuffle(data_reader(train_sample_dir),
                                      buf_size=8192),
                batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id,
                               np.array(cost_list).mean()))
                        cost_list = []
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order, fetch_targets)
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
            fleet.stop_worker()