def distribute_train(args):
    """Run parameter-server distributed training for the CTR model.

    Uses environment variables (via PaddleCloudRoleMaker) to decide whether this
    process is a parameter server or a worker, then runs the matching loop.

    Args:
        args: parsed CLI/config namespace (learning_rate, epochs, save_model,
              model_path, and the fields consumed by CTR.input_data/net).
    """
    # Determine this node's role in the distributed job from environment
    # variables, then initialize fleet with it.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    # Configure the distributed run mode via DistributeTranspilerConfig:
    # asynchronous training (sync_mode=False) with runtime send/recv splitting
    # so parameters are sharded across server nodes.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True
    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)
    # Wrap the base optimizer with the distributed optimizer (using the
    # strategy above) and build the transpiled programs.
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    # Branch on the node role.
    if fleet.is_server():
        # Initialize and run the parameter-server loop (blocks until shutdown).
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        # Initialize the worker side of the job.
        fleet.init_worker()
        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which contains the distributed startup ops.
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # Shuffle at file granularity each epoch.
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            # Workers run the distributed-pruned fleet.main_program.
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), end_time - start_time))
            # By convention only worker 0 saves the model.
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
        fleet.stop_worker()
        logger.info("Distribute Train Success!")
def test(self):
    """Build an async distributed program using an Adagrad optimizer with
    exponentially decayed learning rate (server-side transpile)."""
    server_eps = [
        "127.0.0.1:36004",
        "127.0.0.1:36005",
        "127.0.0.1:36006",
        "127.0.0.1:36007",
    ]
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=server_eps))
    loss, acc, _ = self.net()
    decayed_lr = fluid.layers.exponential_decay(
        learning_rate=base_lr,
        decay_steps=500,
        decay_rate=0.969,
        staircase=True)
    base_opt = fluid.optimizer.Adagrad(learning_rate=decayed_lr)
    dist_opt = fleet.distributed_optimizer(
        base_opt, StrategyFactory.create_async_strategy())
    dist_opt.minimize(loss)
def run_pserver(self, args):
    """Configure the transpiler (sync/GEO flags from args) and serve as a
    parameter server until shut down."""
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(",")))
    conf = DistributeTranspilerConfig()
    conf.sync_mode = args.sync_mode
    conf.geo_sgd_mode = args.geo_sgd_mode
    conf.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums
    cost = self.net()
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(LEARNING_RATE), conf)
    dist_opt.minimize(cost)
    fleet.init_server()
    fleet.run_server()
def test_pserver(self):
    """Transpile a GEO-SGD program from the server side and fetch the
    resulting pserver startup/main programs as a smoke check."""
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    batch_size = 128
    is_sparse = True
    is_distribute = False
    # GEO-SGD: async mode with local updates pushed every 5 steps.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.geo_sgd_mode = True
    strategy.geo_sgd_need_push_nums = 5
    avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    # Accessing the transpiled programs verifies minimize() produced them.
    pserver_startup_program = fleet.startup_program
    # Fixed: local variable was misspelled "pserver_mian_program".
    pserver_main_program = fleet.main_program
def instance(self, context):
    """Initialize collective-mode fleet and stash it in the runtime context.

    Args:
        context (dict): shared trainer state; this pass sets 'fleet' and
            advances 'status' to the network-building pass.
    """
    # Imported lazily so the module can load without paddle's collective deps.
    from paddle.fluid.incubate.fleet.collective import fleet
    from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
    # is_collective=True selects GPU all-reduce training (no parameter server).
    role = PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    context['fleet'] = fleet
    context['status'] = 'network_pass'
def init_role():
    """Initialize the fleet role from PaddleCloud-style environment variables.

    Reads TRAINING_ROLE, PADDLE_PORT, PADDLE_PSERVERS, PADDLE_TRAINERS_NUM and
    PADDLE_TRAINER_ID, builds the pserver endpoint list, and calls fleet.init.
    """
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    paddle_role = role_maker.Role.WORKER
    if training_role == "PSERVER":
        paddle_role = role_maker.Role.SERVER
    # Removed unused local `place = F.CPUPlace()` — it was never referenced.
    # Build the endpoint list from the configured ports/IPs.
    ports = os.getenv("PADDLE_PORT", "6174").split(",")
    # NOTE(review): PADDLE_PSERVERS has no default — this raises AttributeError
    # if the variable is unset; presumably the launcher always provides it.
    pserver_ips = os.getenv("PADDLE_PSERVERS").split(",")  # ip,ip...
    eplist = []
    if len(ports) > 1:
        # Local debug mode: one IP, multiple ports.
        for port in ports:
            eplist.append(':'.join([pserver_ips[0], port]))
    else:
        # Distributed mode: multiple IPs, one shared port.
        for ip in pserver_ips:
            eplist.append(':'.join([ip, ports[0]]))
    pserver_endpoints = eplist  # ip:port,ip:port...
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    role = role_maker.UserDefinedRoleMaker(current_id=trainer_id,
                                           role=paddle_role,
                                           worker_num=worker_num,
                                           server_endpoints=pserver_endpoints)
    fleet.init(role)
def run_trainer(self, args):
    """
    run trainer process, you don't need to implement it.
    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    # Lazy import: only the trainer process needs the transpiler-mode fleet.
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.WORKER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)
    # Subclass hook: populates self.strategy from args.
    self._set_strategy(args)
    avg_cost = self.net(args)
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
    optimizer.minimize(avg_cost)
    # Choose dataset-based or reader-based training per run_params.
    if args.run_params.get("run_from_dataset", False):
        losses = self.do_training_from_dataset(fleet, args)
    else:
        losses = self.do_training(fleet, args)
    # Print losses so the parent process can capture them from stdout.
    losses = "" if not losses else losses
    print(losses)
def test_fleet_barrier(self):
    """Barrier smoke test: a single-worker job should pass the ready check."""
    solo_role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=1,
        server_endpoints=['127.0.0.1'])
    fleet.init(solo_role)
    check_all_trainers_ready("/ready_path/", 0)
def instance(self, context):
    """Initialize parameter-server-mode fleet and record it in the shared
    context, then advance the state machine to the network pass."""
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker

    fleet.init(PaddleCloudRoleMaker())
    context['fleet'] = fleet
    context['status'] = 'network_pass'
def run_pserver(self, args):
    """Stand up a parameter server: build the distributed program from the
    subclass hooks, then serve until shutdown."""
    fleet.init(self.build_role(args))
    dist_strategy = self.build_strategy(args)
    cost = self.net(args)
    self.build_optimizer(cost, dist_strategy)
    fleet.init_server()
    fleet.run_server()
def append_additional_args(self, FLAGS): """ append addtional args from the existing args """ #dataset_dir and train_dir is defined in padllecloud, cannot be set by user role = role_maker.PaddleCloudRoleMaker() fleet.init(role) return super(PaddleCloudFleetTrainer, self).append_additional_args(FLAGS)
def run_pserver(self, role, strategy):
    """Transpile the net under the given role/strategy and run as a pserver."""
    fleet.init(role)
    cost, _x, _y = self.net()
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), strategy)
    dist_opt.minimize(cost)
    fleet.init_server()
    fleet.run_server()
def test_default_strategy(self):
    """distributed_optimizer called without an explicit strategy should fall
    back to the default one."""
    worker_role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker_role)
    base_opt = fluid.optimizer.SGD(0.0001)
    fleet.distributed_optimizer(base_opt)
def main(args):
    """Entry point for distributed metapath2vec training.

    Builds the model, initializes fleet from the cloud environment, then runs
    either the parameter-server loop or the worker training loop depending on
    this process's role.
    """
    log.info("start")
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))
    model = Metapath2vecModel(config=args)
    pyreader = model.pyreader
    loss = model.forward()
    # Initialize fleet from PaddleCloud environment variables.
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    # Total optimizer steps across all epochs, split over devices and workers.
    train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)
    # Each sample expands into walk_len * win_size training pairs.
    real_batch_size = args.batch_size * args.walk_len * args.win_size
    if args.optimizer == "sgd":
        # Scale the SGD learning rate by the effective batch size.
        args.lr *= real_batch_size
    optimization(args.lr, loss, train_steps, args.optimizer)
    # Run as server or worker according to the role.
    if fleet.is_server():
        fleet.init_server(args.warm_start_from_dir)
        fleet.run_server()
    if fleet.is_worker():
        log.info("start init worker done")
        fleet.init_worker()
        # Only the worker loads samples.
        log.info("init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(fleet.startup_program)
        log.info("Startup done")
        dataset = m2vGraph(args)
        log.info("Build graph done.")
        data_generator = multiprocess_data_generator(args, dataset)
        # Warm-up/profiling pass: time the first 100 generator iterations
        # before wiring the generator into the pyreader.
        cur_time = time.time()
        for idx, _ in enumerate(data_generator()):
            log.info("iter %s: %s s" % (idx, time.time() - cur_time))
            cur_time = time.time()
            if idx == 100:
                break
        pyreader.decorate_tensor_provider(data_generator)
        pyreader.start()
        compiled_prog = build_complied_prog(fleet.main_program, loss)
        train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
def test_dist_geo_server_transpiler(self):
    """GEO-SGD server-side transpile over a pyramid-hash embedding network."""
    num_voc = 128
    embed_dim = 64
    x_shape, x_lod = [16, 10], [[3, 5, 2, 6]]
    x = fluid.data(name='x', shape=x_shape, dtype='int32', lod_level=1)
    hash_embd = fluid.contrib.layers.search_pyramid_hash(
        input=x,
        num_emb=embed_dim,
        space_len=num_voc * embed_dim,
        pyramid_layer=4,
        rand_len=16,
        drop_out_percent=0.5,
        is_training=True,
        use_filter=False,
        white_list_len=6400,
        black_list_len=2800,
        seed=3,
        lr=0.002,
        param_attr=fluid.ParamAttr(
            name="PyramidHash_emb_0",
            learning_rate=0, ),
        param_attr_wl=fluid.ParamAttr(
            name="Filter",
            learning_rate=0, ),
        param_attr_bl=None,
        distribute_update_vars=["PyramidHash_emb_0"],
        name=None)
    cost = fluid.layers.reduce_sum(hash_embd)
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)
    # GEO-SGD: async mode, push local updates every 5 steps.
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.geo_sgd_mode = True
    strategy.geo_sgd_need_push_nums = 5
    optimizer = fluid.optimizer.SGD(0.1)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(cost)
    # Accessing the transpiled programs verifies minimize() produced them.
    pserver_startup_program = fleet.startup_program
    # Fixed: local variable was misspelled "pserver_mian_program".
    pserver_main_program = fleet.main_program
def test_half_async_strategy(self):
    """Half-async mode: sync, GEO, and runtime split flags all disabled."""
    worker_role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(worker_role)
    half_async_config = DistributeTranspilerConfig()
    half_async_config.sync_mode = False
    half_async_config.geo_sgd_mode = False
    half_async_config.runtime_split_send_recv = False
    base_opt = fluid.optimizer.SGD(0.0001)
    fleet.distributed_optimizer(base_opt, half_async_config)
def test(self):
    """Build a GEO-strategy distributed program (push every 20 steps)."""
    server_eps = [
        "127.0.0.1:36004",
        "127.0.0.1:36005",
        "127.0.0.1:36006",
        "127.0.0.1:36007",
    ]
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=server_eps))
    loss, acc, _ = self.net()
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(base_lr),
        StrategyFactory.create_geo_strategy(20))
    dist_opt.minimize(loss)
def processor_register(self):
    """Register context processors for this node's fleet role."""
    fleet.init(PaddleCloudRoleMaker())
    # Both roles share the bootstrap passes.
    self.regist_context_processor('uninit', self.instance)
    self.regist_context_processor('init_pass', self.init)
    if fleet.is_server():
        self.regist_context_processor('server_pass', self.server)
    else:
        # Dataset-based training is only available on Linux.
        if envs.get_platform() == "LINUX":
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass', self.dataloader_train)
        self.regist_context_processor('terminal_pass', self.terminal)
def test_communicator_async(self):
    """Async strategy: the worker communicator should start and stop cleanly."""
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    cost = self.net()
    dist_opt = fleet.distributed_optimizer(
        fluid.optimizer.SGD(0.01),
        StrategyFactory.create_async_strategy())
    dist_opt.minimize(cost)
    fleet.init_worker()
    # Let the communicator run briefly before shutting down.
    time.sleep(10)
    fleet.stop_worker()
def processor_register(self):
    """Register context processors for this node's fleet role."""
    fleet.init(PaddleCloudRoleMaker())
    # Both roles share the bootstrap passes.
    self.regist_context_processor('uninit', self.instance)
    self.regist_context_processor('init_pass', self.init)
    if fleet.is_server():
        self.regist_context_processor('server_pass', self.server)
    else:
        self.regist_context_processor('startup_pass', self.startup)
        # Dataset-based training requires Linux and a non-DataLoader reader.
        uses_dataset = (envs.get_platform() == "LINUX" and envs.get_global_env(
            "dataset_class", None, "train.reader") != "DataLoader")
        if uses_dataset:
            self.regist_context_processor('train_pass', self.dataset_train)
        else:
            self.regist_context_processor('train_pass', self.dataloader_train)
        self.regist_context_processor('infer_pass', self.infer)
        self.regist_context_processor('terminal_pass', self.terminal)
def test_transpile(self): role = role_maker.UserDefinedRoleMaker( current_id=0, role=role_maker.Role.SERVER, worker_num=2, server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"]) # for test optimizer without init(role) fleet.init(role) batch_size = 128 is_sparse = True is_distribute = False strategy = DistributeTranspilerConfig() strategy.sync_mode = False strategy.runtime_split_send_recv = True avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse) self.set_program(avg_cost, strategy) strategy.runtime_split_send_recv = False self.set_program(avg_cost, strategy)
def run_trainer(self, role, strategy):
    """Run one trainer: transpile the net, feed a fake reader through the
    distributed main program, then stop the worker."""
    cpu = fluid.core.CPUPlace()
    exe = fluid.Executor(cpu)
    fleet.init(role)
    cost, x_var, y_var = self.net()
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), strategy)
    dist_opt.minimize(cost)
    exe.run(fleet.startup_program)
    fleet.init_worker()
    reader = paddle.batch(self.fake_reader(), batch_size=24)
    feeder = fluid.DataFeeder(place=cpu, feed_list=[x_var, y_var])
    for batch in reader():
        exe.run(fleet.main_program, feed=feeder.feed(batch), fetch_list=[])
    fleet.stop_worker()
def run_nccl_trainer(self, args):
    """Run an NCCL (collective) trainer via the fleet API.

    Args:
        args: run args configuring the distributed job; must have
              update_method == "nccl" and role == "TRAINER".
    """
    assert args.update_method == "nccl"
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.collective import fleet
    # Removed unused `import six` and an unused DistributeTranspilerConfig()
    # instance — neither was referenced anywhere in this function.
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.run_params['num_threads']
    # Kept for reference: the DistributedStrategy wiring this exec_strategy
    # was meant to feed into.
    #dist_strategy = DistributedStrategy()
    #dist_strategy.exec_strategy = exec_strategy
    #dist_strategy.fuse_memory_size = 1  # MB
    #dist_strategy.fuse_laryer_size = 1
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    avg_cost = self.net(args)
    losses = self.do_training(fleet, args)
    # Print losses so the parent process can capture them from stdout.
    losses = "" if not losses else losses
    print(losses)
def test_communicator_async(self):
    """Async transpiler config: worker communicator starts and stops cleanly."""
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    cost = self.net()
    conf = DistributeTranspilerConfig()
    conf.sync_mode = False
    conf.runtime_split_send_recv = True
    conf.wait_port = False
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), conf)
    dist_opt.minimize(cost)
    fleet.init_worker()
    # Let the communicator run briefly before shutting down.
    time.sleep(10)
    fleet.stop_worker()
def run_nccl_trainer(self, args):
    """
    run nccl trainer, used for gpu case.
    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    assert args.update_method == "nccl"
    # Collective (all-reduce) fleet, not the parameter-server variant.
    from paddle.fluid.incubate.fleet.collective import fleet
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.run_params['num_threads']
    # Kept for reference: the DistributedStrategy wiring this exec_strategy
    # was meant to feed into.
    #dist_strategy = DistributedStrategy()
    #dist_strategy.exec_strategy = exec_strategy
    #dist_strategy.fuse_memory_size = 1 # MB
    #dist_strategy.fuse_laryer_size = 1
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    avg_cost = self.net(args)
    losses = self.do_training(fleet, args)
    # Print losses so the parent process can capture them from stdout.
    losses = "" if not losses else losses
    print(losses)
def run_pserver(self, args):
    """
    run pserver process, you don't need to implement it.
    Args:
        args (ArgumentParser): run args to config dist fleet.
    """
    # Lazy import: only the pserver process needs the transpiler-mode fleet.
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    if args.role.upper() != "PSERVER":
        raise ValueError("args role must be PSERVER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.SERVER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)
    # Subclass hook: populates self.strategy from args.
    self._set_strategy(args)
    avg_cost = self.net(args)
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, self.strategy)
    optimizer.minimize(avg_cost)
    # Optionally warm-start the server from a saved model directory.
    fleet.init_server(model_dir=args.run_params.get("model_dir", ""))
    fleet.run_server()
def run_trainer(self, args):
    """run trainer

    Builds the transpiler config from args.run_params, transpiles the net,
    and runs training, printing the losses for the parent process.
    """
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    import paddle.fluid as fluid
    from paddle.fluid.transpiler.ps_dispatcher import RoundRobin
    from paddle.fluid.transpiler.ps_dispatcher import HashName
    # Fix seeds for reproducible program construction.
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    if args.role.upper() != "TRAINER":
        raise ValueError("args role must be TRAINER")
    role = role_maker.UserDefinedRoleMaker(
        current_id=args.current_id,
        role=role_maker.Role.WORKER,
        worker_num=args.trainers,
        server_endpoints=args.endpoints.split(","))
    fleet.init(role)
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = args.run_params["sync_mode"]
    strategy.async_mode = args.run_params["async_mode"]
    strategy.mode = "pserver"
    strategy.slice_var_up = args.run_params['slice_var_up']
    strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
    # Fixed: the original assigned HashName conditionally and then
    # unconditionally overwrote it with RoundRobin, so HashName could never
    # take effect. RoundRobin now only applies when split_method is falsy.
    if args.run_params['split_method']:
        strategy.split_method = HashName
    else:
        strategy.split_method = RoundRobin
    strategy.wait_port = args.run_params['wait_port']
    strategy.runtime_split_send_recv = args.run_params['runtime_split_send_recv']
    strategy.use_hierarchical_allreduce = args.run_params['use_hierarchical_allreduce']
    # strategy.hierarchical_allreduce_exter_nranks = args.run_params['hierarchical_allreduce_exter_nranks']
    # strategy.hierarchical_allreduce_inter_nranks = args.run_params['hierarchical_allreduce_inter_nranks']
    strategy.geo_sgd_mode = args.run_params['geo_sgd']
    strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
    avg_cost = self.net()
    optimizer = fluid.optimizer.SGD(LEARNING_RATE)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    losses = self.do_training(fleet, args)
    losses = "" if not losses else losses
    print(losses)
def test_debug_info(self):
    """Sync strategy with dump/debug options set over a tiny linear net."""
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    prediction = fluid.layers.fc(input=x, size=1, act=None)
    cost = fluid.layers.square_error_cost(input=prediction, label=y)
    avg_cost = fluid.layers.mean(cost)
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    base_opt = fluid.optimizer.SGD(0.0001)
    sync_strategy = StrategyFactory.create_sync_strategy()
    # Dump the fc output and its gradient for debugging.
    sync_strategy.set_debug_opt({
        "dump_param": ["fc_0.tmp_0"],
        "dump_fields": ["fc_0.tmp_0", "fc_0.tmp_0@GRAD"],
        "dump_fields_path": "dump_text/"
    })
    fleet.distributed_optimizer(base_opt, sync_strategy)
def test_communicator_init_and_start(self):
    """A Communicator built from the transpiled main program should start,
    run briefly, and stop."""
    fleet.init(role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]))
    cost = self.net()
    conf = DistributeTranspilerConfig()
    conf.sync_mode = True
    conf.wait_port = False
    dist_opt = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), conf)
    dist_opt.minimize(cost)
    comm = Communicator(fleet.main_program)
    comm.start()
    time.sleep(10)
    comm.stop()
def __init__(self):
    """Initialize the fleet role from PaddleCloud-style environment variables
    and enable saving on the parameter server."""
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    paddle_role = role_maker.Role.WORKER
    # NOTE(review): `place` is never used below — confirm it can be dropped.
    place = F.CPUPlace()
    if training_role == "PSERVER":
        paddle_role = role_maker.Role.SERVER
    # Set the fleet runtime environment from the configured env vars.
    port = os.getenv("PADDLE_PORT", "6174")
    # NOTE(review): PADDLE_PSERVERS has no default — this raises
    # AttributeError if the variable is unset; presumably the launcher
    # always provides it.
    pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = eplist  # ip:port,ip:port...
    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    role = role_maker.UserDefinedRoleMaker(
        current_id=trainer_id,
        role=paddle_role,
        worker_num=worker_num,
        server_endpoints=pserver_endpoints)
    # NOTE(review): `tfleet` (not `fleet`) — presumably a module-level import
    # alias for the transpiler-mode fleet; verify against the file's imports.
    tfleet.init(role)
    tfleet.save_on_pserver = True