def distribute_train(args):
    """Run one node of a parameter-server CTR training job.

    The node's role (server / worker) is decided from cluster environment
    variables by PaddleCloudRoleMaker; servers serve parameters, workers
    consume the dataset and optionally save checkpoints.
    """
    # Determine this process's role from env vars, then bootstrap fleet.
    cluster_role = role_maker.PaddleCloudRoleMaker()
    fleet.init(cluster_role)

    # Fully asynchronous mode with parameters sliced across servers.
    dist_strategy = DistributeTranspilerConfig()
    dist_strategy.sync_mode = False
    dist_strategy.runtime_split_send_recv = True

    # Build the network and wrap the optimizer with the distributed strategy.
    model = CTR()
    inputs = model.input_data(args)
    avg_cost, auc_var = model.net(inputs, args)
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.Adam(args.learning_rate), dist_strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        # Parameter server: initialize then block serving requests.
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()
        exe = fluid.Executor(fluid.CPUPlace())
        # Run the distributed-aware startup program.
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity each epoch.
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            epoch_start = time.time()
            # Workers run the distributed-pruned fleet.main_program.
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            epoch_end = time.time()
            logger.info("epoch %d finished, use time=%d\n"
                        % (epoch, epoch_end - epoch_start))
            # By convention only worker 0 persists the model.
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
        fleet.stop_worker()
        logger.info("Distribute Train Success!")
def main(args):
    """Entry point for distributed GraphSAGE training.

    Builds the model and reader, initializes fleet, then runs either the
    parameter-server loop or the worker training loop depending on role.
    """
    log.info("start")
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = GraphsageModel(args)
    loss = model.forward()
    train_iter = reader.get_iter(args, model.graph_wrapper, 'train')
    pyreader = fake_py_reader(train_iter, num_devices)

    # Initialize fleet and attach the distributed optimizer to the loss.
    init_role()
    optimization(args.lr, loss, args.optimizer)

    # Servers block in run_server(); workers fall through to training.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()
    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        log.info("init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")
        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, model, pyreader, args)
def run_pserver(self, args):
    """Start a parameter-server process with a geo-SGD capable strategy.

    Raises:
        ValueError: if ``args.role`` is not PSERVER.
    """
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")

    server_role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(server_role)

    # Sync / geo-SGD behavior is driven entirely by the run args.
    config = DistributeTranspilerConfig()
    config.sync_mode = args.sync_mode
    config.geo_sgd_mode = args.geo_sgd_mode
    config.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

    avg_cost = self.net()
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.SGD(LEARNING_RATE), config)
    optimizer.minimize(avg_cost)

    fleet.init_server()
    fleet.run_server()
def run_pserver(self, args):
    """Build role, strategy, net and optimizer, then serve parameters."""
    # All configuration is delegated to the build_* helpers on self.
    fleet.init(self.build_role(args))
    dist_strategy = self.build_strategy(args)
    avg_cost = self.net(args)
    self.build_optimizer(avg_cost, dist_strategy)
    fleet.init_server()
    fleet.run_server()
def run_pserver(self, role, strategy):
    """Serve parameters using a caller-provided role and strategy."""
    fleet.init(role)
    avg_cost, x, y = self.net()
    # Plain SGD wrapped by the distributed optimizer.
    sgd = fluid.optimizer.SGD(0.01)
    sgd = fleet.distributed_optimizer(sgd, strategy)
    sgd.minimize(avg_cost)
    fleet.init_server()
    fleet.run_server()
def server(self, context):
    """Run the TDM parameter server, warm-started from a required model dir."""
    namespace = "train.startup"
    init_model_path = envs.get_global_env("cluster.init_model_path", "",
                                          namespace)
    # TDM cannot cold-start: the tree structure must come from a saved model.
    assert init_model_path != "", "Cluster train must has init_model for TDM"
    fleet.init_server(init_model_path)
    logger.info("TDM: load model from {}".format(init_model_path))
    fleet.run_server()
    context['is_exit'] = True
def main(args):
    """Entry point for distributed metapath2vec training."""
    log.info("start")
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = Metapath2vecModel(config=args)
    pyreader = model.pyreader
    loss = model.forward()

    # Initialize fleet from cluster environment variables.
    fleet.init(role_maker.PaddleCloudRoleMaker())

    # Total optimizer steps across all devices and workers.
    train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    # SGD has no per-sample normalization here, so scale lr by the
    # effective number of samples per batch.
    real_batch_size = args.batch_size * args.walk_len * args.win_size
    if args.optimizer == "sgd":
        args.lr *= real_batch_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # Servers block serving; workers run the training loop below.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()
    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        log.info("init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        dataset = m2vGraph(args)
        log.info("Build graph done.")
        data_generator = multiprocess_data_generator(args, dataset)

        # Warm up / time the generator for the first 100 batches.
        cur_time = time.time()
        for idx, _ in enumerate(data_generator()):
            log.info("iter %s: %s s" % (idx, time.time() - cur_time))
            cur_time = time.time()
            if idx == 100:
                break

        pyreader.decorate_tensor_provider(data_generator)
        pyreader.start()
        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
def run_server(self, FLAGS):
    """Default parameter-server entry: warm-start, optionally init params,
    then serve.

    Returns:
        True after run_server() returns.
    """
    # TODO: load pre model
    fleet.init_server(FLAGS.init_pretrain_model)
    if FLAGS.init_train_params is not None:
        # Server-side parameter initialization happens on CPU.
        self.paddle_env['factory']['net'].init_params(fluid.CPUPlace())
    logging.info("PServer init success!")
    fleet.run_server()
    return True
def init_and_run_ps_worker(self, ckpt_path):
    """Initialize this process as either a parameter server or a worker.

    Servers warm-start from ``ckpt_path``, serve forever and then exit the
    process; workers initialize and run the startup program.
    """
    # A single CPU executor is used regardless of role.
    self.exe = F.Executor(F.CPUPlace())

    if tfleet.is_server():
        tfleet.init_server()
        # Load checkpoint variables into the server's startup program.
        self.warmstart(tfleet.startup_program, path=ckpt_path)
        tfleet.run_server()
        # run_server() only returns on shutdown; terminate the process.
        exit()

    if tfleet.is_worker():
        log.info("start init worker done")
        tfleet.init_worker()
        self.exe.run(tfleet.startup_program)
def run_pserver(self, args):
    """
    run pserver process, you don't need to implement it.
    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")

    server_role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(server_role)

    # Strategy is prepared on self by the subclass/helper.
    self._set_strategy(args)
    avg_cost = self.net(args)
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.SGD(LEARNING_RATE), self.strategy)
    optimizer.minimize(avg_cost)

    # Warm-start from model_dir when provided ("" means cold start).
    fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
    fleet.run_server()
def run_pserver(self, args):
    """Run a parameter-server process configured entirely from run_params.

    Args:
        args: parsed args; ``args.run_params`` is a dict of transpiler
            options and ``args.role`` must be "PSERVER".

    Raises:
        ValueError: if ``args.role`` is not PSERVER.
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    import paddle.fluid as fluid
    from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
    from paddle.fluid.transpiler.ps_dispatcher import HashName

    # Fixed seeds keep parameter initialization reproducible across runs.
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1

    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")

    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)

    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = args.run_params["sync_mode"]
    strategy.async_mode = args.run_params["async_mode"]
    strategy.mode = "pserver"
    strategy.slice_var_up = args.run_params['slice_var_up']
    strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
    # BUG FIX: the original set HashName under the flag and then
    # unconditionally overwrote it with RoundRobin, so the
    # 'split_method' run-param never had any effect.
    if args.run_params['split_method']:
        strategy.split_method = HashName
    else:
        strategy.split_method = RoundRobin
    strategy.wait_port = args.run_params['wait_port']
    strategy.runtime_split_send_recv = args.run_params['runtime_split_send_recv']
    strategy.use_hierarchical_allreduce = args.run_params['use_hierarchical_allreduce']
    strategy.geo_sgd_mode = args.run_params['geo_sgd']
    strategy.geo_sgd_need_push_nums = args.run_params['push_nums']

    avg_cost = self.net(args)
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_server()
    fleet.run_server()
def train(use_cuda, train_sample_dir, test_sample_dir, old_model,
          output_model, is_local, is_increment):
    """ train

    Trains the navi model either locally (ParallelExecutor over the default
    programs) or distributed via fleet in fully-async mode.  Python-2 style
    print statements are preserved as-is.
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    role = role_maker.PaddleCloudRoleMaker()
    # Fully-asynchronous distributed training configuration.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)
    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)
        # train_reader = paddle.batch(
        #     paddle.reader.shuffle(
        #         streaming_data_reader(), buf_size=8192), batch_size=BATCH_SIZE)
        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(
            paddle.batch(
                paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
                batch_size=BATCH_SIZE),
            multi_devices=False, drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)
            # for auc_state in model_args['auc'][2]:
            #     set_zero(place, fluid.global_scope(), auc_state.name)
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        # NOTE(review): setting num_threads on a Program looks like a bug —
        # exec_strategy/build_strategy below are also never passed to the
        # ParallelExecutor. Confirm intent before relying on thread counts.
        main_program.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True
        # Parallel (multi-thread) training is faster.
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name)
        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))
                # Print the running mean cost every 100 batches.
                if batch_id % 100 == 0 and batch_id != 0:
                    print "Pass %d, batch %d, cost %s" % \
                        (pass_id, batch_id, np.array(cost_list).mean())
                    cost_list = []
                # Periodically checkpoint and run inference on the test set.
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order,
                            [navi_predict, voice_navi_predict,
                             speed_navi_predict, avg_cost], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)
    else:
        # Initialize the distributed environment via fleet.
        fleet.init(role)
        # Wrap the optimizer with the distributed strategy.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)
        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialize worker-side configuration.
            fleet.init_worker()
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)
            # train_reader = paddle.batch(
            #     paddle.reader.shuffle(
            #         data_reader(train_sample_dir), buf_size=8192), batch_size=BATCH_SIZE)
            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(data_reader(train_sample_dir),
                                          buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)
            exe.run(fleet.startup_program)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))
                    if batch_id % 100 == 0 and batch_id != 0:
                        print "Pass %d, batch %d, cost %s" % \
                            (pass_id, batch_id, np.array(cost_list).mean())
                        cost_list = []
                    # Only worker 0 saves checkpoints.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order,
                                [navi_predict, voice_navi_predict,
                                 speed_navi_predict, avg_cost])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
            fleet.stop_worker()
def train(args):
    """run train

    Distributed TDM training: servers warm-start from an init model; workers
    additionally set the learning rate and the TDM tree tensors (which live
    outside the parameter server) by hand before the dataset training loop.
    """
    # Fix the random seed of the default main program.
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # Determine this process's distributed role from environment variables,
    # then initialize fleet with it.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # Choose the distributed run mode (sync / half_async / async) via the
    # strategy factory.
    if args.sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif args.sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    elif args.sync_mode == "async":
        strategy = StrategyFactory.create_async_strategy()

    # Build the model.
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()
    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")

    # Wrap the optimizer with the distributed strategy and build the program.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=args.learning_rate,
        lazy_mode=True)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Branch on the node's role.
    if fleet.is_server():
        logger.info("TDM Run server ...")
        # Initialize and run the parameter server.
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        # Everything except the TDM tree variables is initialized here
        # from the init model files.
        fleet.init_server(args.init_model_files_path)
        # Override the stored learning rate with the CLI value, if present.
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")
        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        # Initialize the worker.
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run the distributed-aware startup program.
        exe.run(fleet.startup_program)
        # Set Learning Rate
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                place)
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        # Set TDM Variable
        logger.info("TDM Begin load parameter.")
        # Set TDM_Tree_Info: the tree-structure variables are not trained and
        # are not stored on the parameter server, so they must be set locally.
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        Numpy_model = {}
        Numpy_model['TDM_Tree_Travel'] = tdm_param_prepare_dict['travel_array']
        Numpy_model['TDM_Tree_Layer'] = tdm_param_prepare_dict['layer_array']
        Numpy_model['TDM_Tree_Info'] = tdm_param_prepare_dict['info_array']
        # Numpy_model['TDM_Tree_Emb'] = tdm_emb_prepare(args)
        # In distributed training the embedding lives on the parameter
        # server, so it is not set locally.
        for param_name in Numpy_model:
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(Numpy_model[str(param_name)].astype('int32'), place)
        logger.info("TDM Run Startup End")

        # Train loop
        dataset, file_list, example_num = get_dataset(inputs, args)
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # local shuffle
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            # Workers run the distributed-pruned fleet.main_program.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # By convention only worker 0 saves the model.
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                logger.info("Begin upload files")
                # upload_files(model_path, warm_up=False)
                # In a cluster environment this can upload the model to HDFS.

        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
# Top-level distributed-training fragment.  NOTE(review): `input_folder`,
# `output`, `optimizer`, `avg_cost`, `dataset` and `auc_var` are defined
# earlier in the enclosing scope (not visible here) — confirm against the
# full script.
train_filelist = [
    "{}{}".format(input_folder, f)
    for f in output.decode('ascii').strip().split('\n')
]
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

# Asynchronous distributed mode.
config = DistributeTranspilerConfig()
config.sync_mode = False

optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init_worker()
    exe.run(fluid.default_startup_program())
    print("startup program done.")
    # Each worker trains on its own shard of the file list.
    fleet_filelist = fleet.split_files(train_filelist)
    dataset.set_filelist(fleet_filelist)
    exe.train_from_dataset(program=fluid.default_main_program(),
                           dataset=dataset,
                           fetch_list=[auc_var],
                           fetch_info=["auc"],
                           debug=True)
    print("end .... ")
    # save model here
def train(args):
    """Distributed CTR training driven by a pipe-command dataset.

    Role is user-defined from ``args``: PSERVER processes serve parameters,
    TRAINER processes run ten epochs of dataset training.
    """
    datas, avg_cost, predict, train_file_path = model()

    endpoints = args.endpoints.split(",")
    is_server = args.role.upper() == "PSERVER"
    # Servers identify themselves by their endpoint position; workers use 0.
    current_id = endpoints.index(args.current_endpoint) if is_server else 0

    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=(role_maker.Role.WORKER
              if args.role.upper() == "TRAINER"
              else role_maker.Role.SERVER),
        worker_num=args.trainers,
        server_endpoints=endpoints)

    exe = fluid.Executor(fluid.CPUPlace())
    fleet.init(role)

    # Asynchronous mode.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False

    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.SGD(learning_rate=0.0001), strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        logger.info("run pserver")
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        logger.info("run trainer")
        fleet.init_worker()
        exe.run(fleet.startup_program)

        # One copy of the training file per dataset thread.
        thread_num = 2
        filelist = [train_file_path for _ in range(thread_num)]

        # Configure the pipe-command dataset.
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_batch_size(128)
        dataset.set_use_var(datas)
        dataset.set_pipe_command('python ctr_dataset_reader.py')
        dataset.set_filelist(filelist)
        dataset.set_thread(thread_num)

        for epoch_id in range(10):
            logger.info("epoch {} start".format(epoch_id))
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_list=[avg_cost],
                fetch_info=["cost"],
                print_period=100,
                debug=False)
            pass_time = time.time() - pass_start
            logger.info("epoch {} finished, pass_time {}".format(epoch_id,
                                                                 pass_time))
        fleet.stop_worker()
def train(use_cuda, save_dirname, is_local, is_increment):
    """ train

    Local mode: multi-process reader + CompiledProgram over the default
    programs.  Distributed mode: fleet in fully-async mode.  Python-2
    print statements are preserved as-is.
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Optimizer later wrapped by fleet.distributed_optimizer with the
    # distributed strategy in the non-local branch.
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)
    # sgd_optimizer = fluid.optimizer.Adam(learning_rate=2e-5)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        # Fan out 16 reader processes feeding the DataLoader.
        readers = []
        for i in range(16):
            readers.append(data_reader(cluster_train_dir))
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))
        # data_reader(cluster_train_dir), batch_size=BATCH_SIZE, places=fluid.cpu_places(CPU_NUM))
        # feeder = fluid.DataFeeder(feed_order, place)
        # train_reader = feeder.decorate_reader(
        #     paddle.batch(paddle.reader.shuffle(
        #         data_reader(cluster_train_dir), buf_size=8192), batch_size=BATCH_SIZE),
        #     multi_devices=False, drop_last=True)

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce  # cpu reduce faster
        build_strategy.fuse_broadcast_ops = True
        # build_strategy.async_mode = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name, exec_strategy=exec_strategy,
            build_strategy=build_strategy)
        # loss_name=avg_cost.name, exec_strategy=exec_strategy, build_strategy=build_strategy, places=fluid.cpu_places(CPU_NUM))

        if is_increment:  # load model to fine-tune
            # NOTE(review): old_model is hard-coded to None above, so this
            # load_params call cannot succeed — confirm intended source.
            fluid.io.load_params(exe, old_model, main_program)
            # Reset AUC accumulator states before fine-tuning.
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        # Parallel training is faster.
        # train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
        #     main_program=main_program, loss_name=avg_cost.name,
        #     exec_strategy=exec_strategy, build_strategy=build_strategy)

        cost_list = []
        auc_list = []
        import time
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))
                # Report running means every 10 batches.
                # (The "triantime" typo is in the original output string.)
                if batch_id % 10 == 0 and batch_id != 0:
                    print "Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" % \
                        (pass_id, batch_id, np.array(cost_list).mean(),
                         np.array(auc_list).mean(), r_time, t_time)
                    cost_list = []
                    auc_list = []
                # Periodic checkpoint + offline inference.
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname, feed_order,
                            [predict, avg_cost, auc_batch], exe)
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)
    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully-asynchronous distributed training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialize the distributed environment via fleet.
        fleet.init(role)
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialize worker-side configuration.
            fleet.init_worker()
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)
            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(data_reader(cluster_train_dir),
                                          buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)
            exe.run(fleet.startup_program)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True
            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                import time
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    cost_value, auc_value = exe.run(
                        program=compiled_prog,
                        feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - r_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))
                    if batch_id % 10 == 0 and batch_id != 0:
                        print "Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" % \
                            (pass_id, batch_id, np.array(cost_list).mean(),
                             np.array(auc_list).mean(), r_time, t_time)
                        cost_list = []
                        auc_list = []
                    # Only worker 0 checkpoints.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe, save_dirname, feed_order,
                                [predict, avg_cost, auc_batch])
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
            fleet.stop_worker()
def test_pslib_2(self):
    """Test cases for pslib.

    Exercises fleet init against a pslib backend (skipping when mpi4py or
    pslib support is absent), then probes GeneralRoleMaker / RoleMakerBase /
    MPISymetricRoleMaker APIs for crash-freedom; each API is called twice to
    hit any cached-path branch.
    """
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
    # Fake a single-trainer cluster through environment variables.
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINERS_NUM"] = "1"
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    try:
        fleet.init(None)
    except:
        print("no mpi4py, skip test_pslib_2")
        return
    train_program = fluid.Program()
    startup_program = fluid.Program()
    scope = fluid.Scope()
    # Minimal logistic-regression net to give the optimizer something real.
    with fluid.program_guard(train_program, startup_program):
        show = fluid.layers.data(name="show", shape=[-1, 1], \
            dtype="float32", lod_level=1, append_batch_size=False)
        fc = fluid.layers.fc(input=show, size=1, act=None)
        label = fluid.layers.data(name="click", shape=[-1, 1], \
            dtype="int64", lod_level=1, append_batch_size=False)
        label_cast = fluid.layers.cast(label, dtype='float32')
        cost = fluid.layers.log_loss(fc, label_cast)
    try:
        adam = fluid.optimizer.Adam(learning_rate=0.000005)
        adam = fleet.distributed_optimizer(adam)
        adam.minimize([cost], [scope])
        fleet.run_server()
    except:
        print("do not support pslib test, skip")
        return
    # An invalid TRAINING_ROLE must make generate_role() raise.
    os.environ["TRAINING_ROLE"] = "wrong"
    try:
        role1 = GeneralRoleMaker(path="./test_gloo_1")
        role1.generate_role()
    except:
        print("catch expected error of wrong TRAINING_ROLE")
    # Server-side gloo role-maker calls.
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    role2 = GeneralRoleMaker(path="./test_gloo_2")
    role2._finalize()
    role2._all_gather(1)
    role2._all_gather(1)
    role2._barrier_server()
    role2._all_gather(1)
    role3 = GeneralRoleMaker(path="./test_gloo_3")
    role3._worker_gather(1)
    role3._worker_gather(1)
    # Trainer-side gloo role-maker calls (each public API twice).
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
    role4 = GeneralRoleMaker(path="./test_gloo_4")
    role4._worker_gather(1)
    role4._get_rank()
    role4._get_size()
    role4._all_comm.init()
    role5 = GeneralRoleMaker(path="./test_gloo_5")
    role5.get_local_endpoint()
    role5.get_local_endpoint()
    role6 = GeneralRoleMaker(path="./test_gloo_6")
    role6.get_trainer_endpoints()
    role6.get_trainer_endpoints()
    role7 = GeneralRoleMaker(path="./test_gloo_7")
    role7.get_pserver_endpoints()
    role7.get_pserver_endpoints()
    role8 = GeneralRoleMaker(path="./test_gloo_8")
    role8.is_worker()
    role8.is_worker()
    role9 = GeneralRoleMaker(path="./test_gloo_9")
    role9.is_server()
    role9.is_server()
    role10 = GeneralRoleMaker(path="./test_gloo_10")
    role10.is_first_worker()
    role10.is_first_worker()
    role11 = GeneralRoleMaker(path="./test_gloo_11")
    role11.worker_index()
    role11.worker_index()
    role12 = GeneralRoleMaker(path="./test_gloo_12")
    role12.server_index()
    role12.server_index()
    role13 = GeneralRoleMaker(path="./test_gloo_13")
    role13.worker_num()
    role13.worker_num()
    role14 = GeneralRoleMaker(path="./test_gloo_14")
    role14.server_num()
    role14.server_num()
    role15 = GeneralRoleMaker(path="./test_gloo_15")
    role15._barrier_worker()
    role15._barrier_worker()
    role16 = GeneralRoleMaker(path="./test_gloo_16")
    role16._barrier_all()
    role16._barrier_all()
    role17 = GeneralRoleMaker(path="./test_gloo_17")
    role17._barrier_server()
    role17._barrier_server()
    role18 = GeneralRoleMaker(path="./test_gloo_18")
    role18._worker_num()
    role18._worker_num()
    role19 = GeneralRoleMaker(path="./test_gloo_19")
    role19._server_num()
    role19._server_num()
    role20 = GeneralRoleMaker(path="./test_gloo_20")
    a = [1]
    b = [0]
    role20._all_reduce(a, b)
    role21 = GeneralRoleMaker(path="./test_gloo_21")
    role21.all_reduce_worker([], [])
    role21.all_reduce_worker([], [])
    role21.barrier_worker()
    role21.barrier_all()
    role22 = GeneralRoleMaker(path="./test_gloo_22")
    role22._get_rank()
    role22._get_rank()
    os.environ["PADDLE_PSERVER_ID"] = "0"
    role23 = GeneralRoleMaker(path="./test_gloo_23")
    role23._get_size()
    role23._get_size()
    # Exercise the dataset size queries through fleet on a tiny file.
    with open("test_fleet_gloo_role_maker_1.txt", "w") as f:
        data = "1 1 1 1\n"
        f.write(data)
    dataset = paddle.distributed.InMemoryDataset()
    dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
    dataset._set_use_var([show, label])
    dataset.load_into_memory()
    dataset.get_memory_data_size(fleet)
    dataset.get_shuffle_data_size(fleet)
    os.remove("./test_fleet_gloo_role_maker_1.txt")

    class TmpClass():
        """
        dummy tmp class
        """

        def __init__(self):
            pass

        def all_reduce_worker(self, input, output):
            """
            dummy all reduce worker
            Args:
                input(None): fake input
                output(None): fale output
            """
            pass

        def barrier_worker(self):
            """
            dummy barrier worker
            """
            pass

    from paddle.fluid.incubate.fleet.base.fleet_base import Fleet

    class TmpFleet(Fleet):
        """
        dummy tmp fleet
        """

        def __init__(self):
            super(TmpFleet, self).__init__()
            self._role_maker = None

        def init_worker(self):
            """
            dummy init worker
            """
            pass

        def init_server(self, model_dir=None):
            """
            dummy init server
            Args:
                model_dir(None): fake model_dir
            """
            pass

        def run_server(self):
            """
            dummy run server
            """
            pass

        def stop_worker(self):
            """
            dummy stop worker
            """
            pass

        def distributed_optimizer(self, optimizer, strategy=None):
            """
            dummy distributed optimizer
            Args:
                optimizer(None): fake optimizer
                strategy(None): fake strategy
            """
            pass

        def save_inference_model(self):
            """
            dummy save inference model
            """
            pass

        def save_persistables(self):
            """
            dummy save persistables
            """
            pass

    # Fleet base-class delegation to a dummy role maker must not raise.
    os.environ["TRAINING_ROLE"] = "TRAINER"
    tmp = TmpFleet()
    tmp._role_maker = TmpClass()
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
    tmp = RoleMakerBase()
    tmp.all_gather(1)
    tmp.all_reduce_worker([], [])
    tmp.barrier_worker()
    tmp.barrier_all()
    from paddle.fluid.incubate.fleet.base.role_maker import \
        MPISymetricRoleMaker
    tmp1 = MPISymetricRoleMaker()
    tmp1.all_gather(1)
    tmp1.all_gather(1)
    tmp2 = MPISymetricRoleMaker()
    tmp2.all_reduce_worker([], [])
    tmp3 = MPISymetricRoleMaker()
    tmp3.barrier_worker()
    tmp3.barrier_worker()
    tmp4 = MPISymetricRoleMaker()
    tmp4.barrier_all()
    tmp4.barrier_all()
def fit():
    """Distributed image-classification training (sync mode).

    NOTE(review): ``current_id``, ``roles``, ``in_file_path``,
    ``model_file_path`` and ``EPOCH_NUM`` come from the enclosing scope /
    module globals — confirm against the full script.
    """
    role = role_maker.UserDefinedRoleMaker(
        current_id=current_id,
        role=role_maker.Role.WORKER if bool(1 == int(roles))
        else role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011"])
    fleet.init(role)
    BATCH_SIZE = 128
    type_size = createDataList(in_file_path, in_file_path + '.data' + "/")
    # Reader providing the training data.
    train_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/trainer.list"),
            buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=dataReader(in_file_path + ".data/test.list"),
            buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    images = fluid.layers.data(name='images', shape=data_shape,
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Build the classifier.
    predict = networkConfiguration(images, type_size)
    # Loss and accuracy.
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross entropy
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from predictions and labels
    # Clone a test program before attaching the optimizer.
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = True
    # Wrap the optimizer with the (synchronous) distributed strategy.
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    if fleet.is_server():
        print("启动server")
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        print("启动worker")
        fleet.init_worker()
        print(fleet.worker_endpoints())
        # ---------- model training & evaluation ----------
        # Create the Executor.
        use_cuda = False  # CPU when use_cuda=False, GPU otherwise
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        print("cpu")
        # Data feeder mapping batches onto the input variables.
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
        print("数据映射")
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        for pass_id in range(EPOCH_NUM):
            print(pass_id)
            # Training pass over train_reader.
            for batch_id, data in enumerate(train_reader()):
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),  # run the main program
                    feed=feeder.feed(data),  # feed one batch
                    fetch_list=[avg_cost, acc])  # fetch loss and accuracy
                # Every 20 batches: print training stats and run a test pass.
                # NOTE(review): the original comment says "every 100 batches"
                # but the code checks % 20 — the code wins here.
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f'
                          % (pass_id, batch_id, train_cost[0], train_acc[0]))
                    # Evaluation over the full test set.
                    test_costs = []  # per-batch test losses
                    test_accs = []  # per-batch test accuracies
                    for batch_id, data in enumerate(test_reader()):
                        test_cost, test_acc = exe.run(
                            program=test_program,  # run the test program
                            feed=feeder.feed(data),  # feed data
                            fetch_list=[avg_cost, acc])  # fetch loss, accuracy
                        test_costs.append(test_cost[0])  # record batch loss
                        test_accs.append(test_acc[0])  # record batch accuracy
                    test_cost = (sum(test_costs) / len(test_costs))  # mean loss
                    test_acc = (sum(test_accs) / len(test_accs))  # mean accuracy
                    print('Test:%d, Cost:%0.5f, ACC:%0.5f'
                          % (pass_id, test_cost, test_acc))
        save(predict, model_file_path, exe)
        fleet.stop_worker()
def train(args):
    """Entry point for distributed Deepwalk training."""
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # Initialize fleet, compute the total step budget, and attach the
    # distributed optimizer.
    init_role()
    train_steps = math.ceil(1. * args.num_nodes * args.epoch /
                            args.batch_size / num_devices / worker_num)
    log.info("Train step: %s" % train_steps)
    if args.optimizer == "sgd":
        # Scale lr by the effective per-batch sample count for plain SGD.
        args.lr *= args.batch_size * args.walk_len * args.win_size
    optimization(args.lr, loss, train_steps, args.optimizer)

    # Servers block serving; workers run the training loop.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()
    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        log.info("init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")

        # Pick the graph source: built-in dataset, edge file, or fake graph.
        if args.dataset is not None:
            if args.dataset == "BlogCatalog":
                graph = data_loader.BlogCatalogDataset().graph
            elif args.dataset == "ArXiv":
                graph = data_loader.ArXivDataset().graph
            else:
                raise ValueError(args.dataset + " dataset doesn't exists")
            log.info("Load buildin BlogCatalog dataset done.")
        elif args.walkpath_files is None or args.walkpath_files == "None":
            graph = build_graph(args.num_nodes, args.edge_path)
            log.info("Load graph from '%s' done." % args.edge_path)
        else:
            graph = build_fake_graph(args.num_nodes)
            log.info("Load fake graph done.")

        # Bind the walk generator to the reader and start training.
        gen_func = build_gen_func(args, graph)
        pyreader.decorate_tensor_provider(gen_func)
        pyreader.start()
        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
def server(self, context):
    """Cold-start a parameter server, serve, then flag the runner to exit."""
    fleet.init_server()
    fleet.run_server()
    # run_server() returns only on shutdown; tell the driver loop to stop.
    context['is_exit'] = True