def distribute_train(args):
    """Entry point for fleet (parameter-server) distributed training of the CTR model.

    Builds the network, wraps the optimizer with fleet's distributed
    optimizer, then branches on this node's role: parameter servers block in
    ``run_server()`` while workers run the epoch loop via
    ``train_from_dataset`` and worker 0 checkpoints each epoch.
    """
    # Determine this machine/process's role in the distributed job from
    # environment variables, then initialize the node through the fleet API.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    # Configure the distributed run mode via DistributeTranspilerConfig:
    # asynchronous (async) training, with parameters split across the
    # different server nodes.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True
    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)
    # Wrap the optimizer with the chosen strategy and build the program.
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    # Run different logic depending on this node's role.
    if fleet.is_server():
        # Initialize and run a parameter-server node (run_server blocks).
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        # Initialize a worker node.
        fleet.init_worker()
        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which embeds the distributed startup steps.
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity.
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            # Workers execute fleet.main_program, the distributed-pruned program.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            # NOTE: %d truncates the elapsed seconds to an integer.
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), end_time - start_time))
            # By convention, worker 0 saves the model.
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
        fleet.stop_worker()
        logger.info("Distribute Train Success!")
def train_prog(exe, program, model, pyreader, args):
    """Run the training loop over ``args.num_epoch`` epochs.

    Each batch executes ``program`` fetching ``model.loss`` and ``model.acc``,
    logs loss/accuracy/throughput every ``args.log_per_step`` batches, and
    lets trainer 0 checkpoint persistable variables every
    ``args.steps_per_save`` steps. A failing batch is logged and skipped so
    the loop keeps running.

    Fix over the original: removed the dead accumulators ``total_loss``,
    ``total_acc`` and ``total_sample`` that were initialized but never used.
    """
    # Trainer index in the distributed job; only trainer 0 writes checkpoints.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    start = time.time()
    batch = 0
    for epoch_idx in range(args.num_epoch):
        for step, batch_feed_dict in enumerate(pyreader()):
            try:
                cpu_time = time.time()
                batch += 1
                batch_loss, batch_acc = exe.run(
                    program,
                    feed=batch_feed_dict,
                    fetch_list=[model.loss, model.acc])
                end = time.time()
                if batch % args.log_per_step == 0:
                    log.info(
                        "Batch %s Loss %s Acc %s \t Speed(per batch) %.5lf/%.5lf sec"
                        % (batch, np.mean(batch_loss), np.mean(batch_acc),
                           (end - start) / batch, (end - cpu_time)))
                if step % args.steps_per_save == 0:
                    save_path = args.save_path
                    if trainer_id == 0:
                        model_path = os.path.join(save_path, "%s" % step)
                        fleet.save_persistables(exe, model_path)
            except Exception as e:
                # Best-effort training: log the failure and move on to the
                # next batch instead of aborting the whole job.
                log.info("Pyreader train error")
                log.exception(e)
def save_model(self, FLAGS, net_output, global_step):
    """Persist an inference checkpoint for the current global step.

    Checkpoints are written only by the first worker, and only when
    ``global_step`` is the literal ``"final"`` or a multiple of
    ``FLAGS.save_model_steps``.
    """
    # "final" short-circuits before the modulo, exactly like the original.
    is_save_step = global_step == "final" or global_step % FLAGS.save_model_steps == 0
    if not (is_save_step and fleet.is_first_worker()):
        return
    path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
    model_output = net_output['model_output']
    fleet.save_inference_model(self.paddle_env['exe'],
                               path,
                               model_output['feeded_var_names'],
                               model_output['fetch_targets'])
    # Alternative: fleet.save_persistables(self.paddle_env['exe'], path)
    self.record_checkpoint(FLAGS, global_step)
def save_persistables():
    """Write an incremental (persistable-variable) checkpoint for this epoch.

    The save interval and target directory come from the global env config
    under the ``save.increment`` keys; nothing happens when the current
    epoch is not a save point. The written path is recorded in
    ``self.increment_models``.
    """
    interval = envs.get_global_env(
        "save.increment.epoch_interval", -1, namespace)
    if not need_save(epoch_id, interval, False):
        return
    base_dir = envs.get_global_env("save.increment.dirname", None, namespace)
    assert base_dir is not None
    save_dir = os.path.join(base_dir, str(epoch_id))
    # Fleet runs save through the distributed API, local runs through fluid.io.
    saver = fleet.save_persistables if is_fleet else fluid.io.save_persistables
    saver(self._exe, save_dir)
    self.increment_models.append((epoch_id, save_dir))
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
    """Run the node2vec training loop until ``train_steps`` steps complete.

    Repeatedly executes ``program`` fetching ``loss``; when the reader is
    exhausted (EOFException) it is reset and the loop continues with the
    next pass. Trainer 0 checkpoints persistable variables every
    ``args.steps_per_save`` steps and at the final step.
    """
    # Trainer index in the distributed job; only trainer 0 writes checkpoints.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    step = 0
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    while True:
        try:
            begin_time = time.time()
            loss_val, = exe.run(program, fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            # Data exhausted for this pass: rewind the reader and keep going.
            node2vec_pyreader.reset()
        # NOTE(review): while step is still 0 this condition holds, so a
        # checkpoint can be written before any training step completes —
        # confirm that an initial step-0 checkpoint is intended.
        if step % args.steps_per_save == 0 or step == train_steps:
            save_path = args.save_path
            if trainer_id == 0:
                model_path = os.path.join(save_path, "%s" % step)
                fleet.save_persistables(exe, model_path)
        if step == train_steps:
            break
def save(predict, savaPath, exe):
    """Persist all persistable variables of the fleet program under ``savaPath``.

    The target directory is created when missing. ``predict`` is accepted
    for interface compatibility but is not used by this function.
    """
    target_dir = savaPath
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print('save models to %s' % (target_dir))
    fleet.save_persistables(exe, target_dir)
def train(use_cuda, save_dirname, is_local, is_increment):
    """Train the model either locally (data-parallel CompiledProgram) or in
    fleet parameter-server mode (fully asynchronous).

    Fixes over the original:
    - the distributed loop computed ``t_time = time.time() - r_time`` — an
      absolute timestamp minus a small duration — so the printed train time
      was a huge bogus number; it now measures around ``exe.run`` like the
      local branch does;
    - the log typo "triantime" is corrected to "traintime", matching the
      distributed branch;
    - the redundant per-pass ``import time`` is hoisted to function scope;
    - ``print`` statements use the parenthesized form that behaves
      identically on Python 2 and 3 (single formatted string argument).
    """
    import time
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # fleet distributed_optimizer applies the multi-node strategy (used in
    # the non-local branch below).
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)
        # Fan the training data out over 16 reader processes.
        readers = []
        for i in range(16):
            readers.append(data_reader(cluster_train_dir))
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # cpu reduce is faster than all-reduce here
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:
            # Load a previous model to fine-tune.
            # NOTE(review): old_model is hard-coded to None above, so this
            # incremental path cannot work as written — confirm where the
            # fine-tune checkpoint directory should come from.
            fluid.io.load_params(exe, old_model, main_program)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                r_time = time.time() - s_time  # time spent fetching the batch
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time  # time spent in exe.run
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))
                if batch_id % 10 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname, feed_order,
                            [predict, avg_cost, auc_batch], exe)
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)
    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous distributed training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # fleet init sets up the distributed environment.
        fleet.init(role)
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            fleet.init_worker()
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)
            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)
            exe.run(fleet.startup_program)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time  # time spent fetching the batch
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog,
                        feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    # FIX: was `time.time() - r_time` (timestamp minus a
                    # duration), which printed a meaningless traintime.
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))
                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    # Only the first worker saves checkpoints.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe, save_dirname, feed_order,
                                [predict, avg_cost, auc_batch])
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
            fleet.stop_worker()
def train(args):
    """Run TDM distributed (parameter-server) training.

    Initializes the fleet node from environment variables, builds the TDM
    network with the strategy selected by ``args.sync_mode``, then runs
    either the parameter-server loop or the worker training loop.

    Fix over the original: an unknown ``args.sync_mode`` used to leave
    ``strategy`` unbound and crash later with a NameError inside
    ``fleet.distributed_optimizer``; it now fails fast with a ValueError.
    """
    # Fix the random seed of the default program for reproducibility.
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # Determine this machine/process's role in the distributed job from
    # environment variables, then initialize the node through the fleet API.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Select the distributed run mode (sync / half_async / async); async
    # also splits parameters across the server nodes.
    if args.sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif args.sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    elif args.sync_mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    else:
        raise ValueError(
            "unsupported sync_mode %r: expected 'sync', 'half_async' or 'async'"
            % args.sync_mode)

    # Build the model.
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()
    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")

    # Wrap the optimizer with the chosen strategy and build the program.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=args.learning_rate, lazy_mode=True)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Run different logic depending on this node's role.
    if fleet.is_server():
        logger.info("TDM Run server ...")
        # Initialize and run a parameter-server node. Everything except the
        # TDM-tree variables is initialized here from the init model files.
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        fleet.init_server(args.init_model_files_path)
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")
        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        # Initialize the worker node.
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run fleet.startup_program, which embeds the distributed startup steps.
        exe.run(fleet.startup_program)

        # Set the learning-rate variable locally.
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'), place)
            logger.info("TDM Set learning rate {}".format(args.learning_rate))

        # Set the TDM tree variables. These variables are not updated by the
        # network and are not stored on the parameter servers, so they must
        # be set locally by hand.
        logger.info("TDM Begin load parameter.")
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        Numpy_model = {}
        Numpy_model['TDM_Tree_Travel'] = tdm_param_prepare_dict['travel_array']
        Numpy_model['TDM_Tree_Layer'] = tdm_param_prepare_dict['layer_array']
        Numpy_model['TDM_Tree_Info'] = tdm_param_prepare_dict['info_array']
        # Numpy_model['TDM_Tree_Emb'] = tdm_emb_prepare(args)
        # In distributed training the embedding lives on the parameter
        # servers, so it is not set locally.
        for param_name in Numpy_model:
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(Numpy_model[str(param_name)].astype('int32'), place)
        logger.info("TDM Run Startup End")

        # Train loop.
        dataset, file_list, example_num = get_dataset(inputs, args)
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # Shuffle at file granularity.
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            # Workers execute fleet.main_program, the distributed-pruned program.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # By convention, worker 0 saves the model.
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                logger.info("Begin upload files")
                # upload_files(model_path, warm_up=False)
                # (uploading the model to HDFS is supported in distributed runs)

        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model,
          is_local, is_increment):
    """Train the navi model either locally (ParallelExecutor) or in fleet
    parameter-server mode (fully asynchronous).

    Fixes over the original:
    - ``exec_strategy`` and ``build_strategy`` were constructed in the local
      branch but never passed to ``fluid.ParallelExecutor``; they are now
      wired in as clearly intended;
    - the meaningless ``main_program.num_threads = CPU_NUM`` (set on a
      ``Program`` object, which has no such knob — threading is configured
      through ``ExecutionStrategy``) is removed;
    - ``print`` statements use the parenthesized form that behaves
      identically on Python 2 and 3 (single formatted string argument).
    """
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous distributed training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)
        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(
            paddle.batch(paddle.reader.shuffle(
                streaming_data_reader(), buf_size=8192),
                batch_size=BATCH_SIZE),
            multi_devices=False, drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:
            # Load a previous model to fine-tune.
            fluid.io.load_params(exe, old_model, main_program)
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True
        # Parallel training for speed; strategies are passed explicitly
        # (they were previously built but never used).
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name,
                                          exec_strategy=exec_strategy,
                                          build_strategy=build_strategy)
        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))
                if batch_id % 100 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order,
                            [navi_predict, voice_navi_predict,
                             speed_navi_predict, avg_cost], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)
    else:
        # fleet init sets up the distributed environment.
        fleet.init(role)
        # fleet distributed_optimizer applies the multi-node strategy.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            fleet.init_worker()
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)
            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(train_sample_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)
            exe.run(fleet.startup_program)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))
                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id, np.array(cost_list).mean()))
                        cost_list = []
                    # Only the first worker saves checkpoints.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order,
                                [navi_predict, voice_navi_predict,
                                 speed_navi_predict, avg_cost])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
            fleet.stop_worker()