def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 0)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()

def test_communicator_sync(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"

    fleet.init(role_maker.PaddleCloudRoleMaker())
    avg_cost = self.net()

    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()

def train(args):
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    data = load_raw_edges_fn(args.edge_path, args.undirected)
    edges = data[0]
    weights = data[1]
    node2idx = data[2]
    num_nodes = len(node2idx)

    model = DeepwalkModel(num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    log.info("init_role")
    init_role()

    train_steps = math.ceil(1. * num_nodes * args.epoch / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size

    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        log.info("PS server mode")
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker done")
        exe = F.Executor(F.CPUPlace())
        exe.run(F.default_startup_program())
        log.info("Startup done")
        fleet.init_worker()  # just the worker, load the sample
        log.info("init worker done")

        print("LEO num_nodes:", num_nodes, len(edges))
        edges_feat = {}
        edges_feat["weight"] = np.array(weights)
        graph = pgl.graph.Graph(num_nodes, edges, edge_feat=edges_feat)

        # bind gen
        gen_func = build_gen_func(args, graph)
        pyreader.decorate_tensor_provider(gen_func)

        train_prog(exe, F.default_main_program(), loss, pyreader, args,
                   train_steps)
        print("fleet try to stop worker\r\n")
        fleet.stop_worker()
        print("Game over\r\n")

def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    input_x = paddle.fluid.layers.data(
        name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(
        input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 6)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()

def run(self):
    fleet.init()
    self.network()
    if fleet.is_server():
        self.run_server()
    elif fleet.is_worker():
        self.run_worker()
    fleet.stop_worker()
    logger.info("Run Success, Exit.")

def run(self):
    self.init_fleet_with_gloo()
    self.network()
    if fleet.is_server():
        self.run_server()
    elif fleet.is_worker():
        self.run_offline_infer()
    fleet.stop_worker()
    # self.record_result()
    logger.info("Run Success, Exit.")

def main(args): paddle.set_device("cpu") paddle.enable_static() fleet.init() fake_num_nodes = 1 py_reader, loss = StaticSkipGramModel( fake_num_nodes, args.neg_num, args.embed_size, sparse_embedding=True, shared_embedding=args.shared_embedding) optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True) dist_strategy = fleet.DistributedStrategy() dist_strategy.a_sync = True optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) optimizer.minimize(loss) # init and run server or worker if fleet.is_server(): fleet.init_server() fleet.run_server() if fleet.is_worker(): place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(paddle.static.default_startup_program()) fleet.init_worker() graph = build_graph(args) # bind gen train_ds = ShardedDataset(graph.nodes, args.epoch) collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size, args.neg_num, args.neg_sample_type) data_loader = Dataloader(train_ds, batch_size=args.cpu_batch_size, shuffle=True, num_workers=args.sample_workers, collate_fn=collate_fn) py_reader.set_batch_generator(lambda: data_loader) train_loss = train(exe, paddle.static.default_main_program(), py_reader, loss) fleet.stop_worker() if fleet.is_first_worker(): fleet.save_persistables(exe, "./model", paddle.static.default_main_program())
def main(args):
    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
    fleet.init(is_collective=True)

    graph = load(args.dataset)

    loss = StaticSkipGramModel(
        graph.num_nodes,
        args.neg_num,
        args.embed_size,
        num_emb_part=args.num_emb_part,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "segment_anchors": None,
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": int(paddle.distributed.get_world_size()),
    }
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # bind gen
    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in range(args.epoch):
        train_loss = train(exe, paddle.static.default_main_program(),
                           data_loader, loss)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

    fleet.stop_worker()

    if fleet.is_first_worker():
        fleet.save_persistables(exe, "./model",
                                paddle.static.default_main_program())

def main(args): paddle.set_device("cpu") paddle.enable_static() role = role_maker.PaddleCloudRoleMaker() fleet.init(role) if args.num_nodes is None: num_nodes = load(args.dataset).num_nodes else: num_nodes = args.num_nodes loss = StaticSkipGramModel( num_nodes, args.neg_num, args.embed_size, sparse=True) optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True) dist_strategy = fleet.DistributedStrategy() dist_strategy.a_sync = True optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) optimizer.minimize(loss) # init and run server or worker if fleet.is_server(): fleet.init_server() fleet.run_server() if fleet.is_worker(): place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(paddle.static.default_startup_program()) fleet.init_worker() graph = load(args.dataset) # bind gen train_ds = ShardedDataset(graph.nodes) collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size, args.neg_num, args.neg_sample_type) data_loader = Dataloader( train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.sample_workers, collate_fn=collate_fn) for epoch in range(args.epoch): train_loss = train(exe, paddle.static.default_main_program(), data_loader, loss) log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss) fleet.stop_worker()
def runtime_main(test_class):
    parser = argparse.ArgumentParser(description='Run Fleet test.')
    parser.add_argument(
        '--role',
        type=str,
        required=True,
        choices=['pserver', 'trainer', 'heter_trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument(
        '--trainer_endpoints', type=str, required=False, default="")
    parser.add_argument(
        '--heter_trainer_endpoints', type=str, required=False, default="")
    parser.add_argument(
        '--heter_trainer_device', type=str, required=False, default="gpu")
    parser.add_argument('--gloo_path', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--mode', type=str, required=False, default='async')
    parser.add_argument(
        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
    parser.add_argument('--reader', type=str, required=False, default='dataset')
    args = parser.parse_args()

    model = test_class()
    role = model.build_role(args)
    fleet.init(role)
    strategy = model.build_strategy(args)
    avg_cost = model.net(args)
    model.build_optimizer(avg_cost, strategy)

    if args.role == "pserver" or args.role == "heter_trainer":
        model.run_pserver(args)
    else:
        if args.reader == "dataset":
            model.run_dataset_trainer(args)
        else:
            model.run_pyreader_trainer(args)
        fleet.stop_worker()

def test_a_sync_optimizer_trainer(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()
    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 7)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()

def run_trainer(self, role, strategy):
    place = fluid.core.CPUPlace()
    exe = fluid.Executor(place)

    fleet.init(role)
    avg_cost, x, z, y = self.net()
    optimizer = fluid.optimizer.SGD(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    train_reader = paddle.batch(self.fake_reader(), batch_size=24)
    feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])

    for batch_id, data in enumerate(train_reader()):
        exe.run(fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[])

    fleet.stop_worker()

def test_communicator_async(self):
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

    fleet.init(role)
    avg_cost = self.net()

    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    os.environ["TEST_MODE"] = "1"
    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()

def main(args):
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    data = pgl.dataset.RedditDataset(args.normalize, args.symmetry)
    log.info("Preprocess finish")
    log.info("Train Examples: %s" % len(data.train_index))
    log.info("Val Examples: %s" % len(data.val_index))
    log.info("Test Examples: %s" % len(data.test_index))
    log.info("Num nodes %s" % data.graph.num_nodes)
    log.info("Num edges %s" % data.graph.num_edges)
    log.info("Average Degree %s" % np.mean(data.graph.indegree()))

    graph = data.graph
    train_index = data.train_index
    val_index = data.val_index
    test_index = data.test_index

    train_label = data.train_label
    val_label = data.val_label
    test_label = data.test_label

    loss, acc = build_net(
        input_size=data.feature.shape[-1],
        num_class=data.num_classes,
        hidden_size=args.hidden_size,
        num_layers=len(args.samples))

    test_program = paddle.static.default_main_program().clone(for_test=True)

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = paddle.fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    else:
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        train_ds = ShardedDataset(train_index, train_label)
        valid_ds = ShardedDataset(val_index, val_label)
        test_ds = ShardedDataset(test_index, test_label)

        collate_fn = partial(batch_fn, graph=graph, samples=args.samples)

        train_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)
        valid_loader = Dataloader(
            valid_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)
        test_loader = Dataloader(
            test_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        compiled_prog, cpu_num = setup_compiled_prog(loss)

        for epoch in tqdm.tqdm(range(args.epoch)):
            train_loss, train_acc = run(
                train_loader, data.feature, exe, compiled_prog, loss, acc,
                phase="train", cpu_num=cpu_num)

            valid_loss, valid_acc = run(
                valid_loader, data.feature, exe, test_program, loss, acc,
                phase="valid", cpu_num=1)
            log.info("Epoch %s Valid-Loss %s Valid-Acc %s" %
                     (epoch, valid_loss, valid_acc))

            test_loss, test_acc = run(
                test_loader, data.feature, exe, test_program, loss, acc,
                phase="test", cpu_num=1)
            log.info("Epoch %s Test-Loss %s Test-Acc %s" %
                     (epoch, test_loss, test_acc))

        fleet.stop_worker()

def runtime_main(test_class):
    parser = argparse.ArgumentParser(description='Run Fleet test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument(
        '--trainer_endpoints', type=str, required=False, default="")
    parser.add_argument('--gloo_path', type=str, required=False, default="")
    parser.add_argument('--current_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument('--mode', type=str, required=False, default='geo')
    parser.add_argument(
        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
    parser.add_argument('--reader', type=str, required=False, default='dataset')
    parser.add_argument('--test', type=int, required=False, default=0)
    parser.add_argument('--model_dir', type=str, required=False, default="")
    args = parser.parse_args()

    model = test_class()
    role = model.build_role(args)

    # for distributed inference
    if args.test and args.model_dir != "":
        avg_cost = model.net(args, is_train=False)
        dist_infer = DistributedInfer()
        dist_infer.init_distributed_infer_env(
            exe=model.get_executor(),
            loss=model.avg_cost,
            role_maker=role,
            dirname=args.model_dir)

        if fleet.is_worker():
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
            fleet.stop_worker()
            return

        if fleet.is_server():
            return

    fleet.init(role)
    strategy = model.build_strategy(args)
    avg_cost = model.net(args)
    model.build_optimizer(avg_cost, strategy)

    if args.role == "pserver":
        model.run_pserver(args)
    else:
        if args.reader == "dataset":
            model.run_dataset_trainer(args)
        else:
            model.run_pyreader_trainer(args)

        if args.test:
            test_origin_program = paddle.static.Program()
            test_startup_program = paddle.static.Program()
            with paddle.static.program_guard(
                    main_program=test_origin_program,
                    startup_program=test_startup_program):
                with paddle.utils.unique_name.guard():
                    avg_cost = model.net(args, is_train=False)
            dist_infer = DistributedInfer(
                main_program=test_origin_program,
                startup_program=test_startup_program)
            with paddle.static.program_guard(
                    main_program=dist_infer.get_dist_infer_program()):
                model.do_distributed_testing(fleet)
        fleet.stop_worker()

def test_stop_worker(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    if fleet.is_worker():
        fleet.stop_worker()

fleet.init(is_collective=False)

model = WideDeepModel()
model.net(is_train=True)

optimizer = paddle.optimizer.SGD(learning_rate=0.0001)
strategy = fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(model.cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()

if fleet.is_worker():
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    fleet.init_worker()

    distributed_training(exe, model)
    clear_metric_state(model, place)
    distributed_infer(exe, model)

    fleet.stop_worker()

def test_communicator_ps_gpu(self):
    with open("test_communicator_ps_gpu.txt", "w") as f:
        data = "1 0.6 1 0.7\n"
        f.write(data)

    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002,127.0.0.2:36002"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["FLAGS_selected_gpus"] = "0"

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    slots_vars = [x, y]

    cost = fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adam(0.01)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {
        "launch_barrier": False,
        "use_ps_gpu": 1,
    }

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    dataset = paddle.distributed.InMemoryDataset()
    dataset.init(
        batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
    dataset.set_filelist(["test_communicator_ps_gpu.txt"])
    dataset.set_date("20211111")
    dataset.load_into_memory(is_shuffle=True)

    os.environ["TEST_MODE"] = "1"
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_program)
    main_program._fleet_opt = {"stat_var_names": [x.name]}

    fleet.init_worker()

    try:
        exe.train_from_dataset(main_program, dataset)
    except ImportError as e:
        pass
    except Exception as e:
        self.assertTrue(False)

    time.sleep(10)
    fleet.stop_worker()
    os.remove("./test_communicator_ps_gpu.txt")

def main(args): paddle.set_device("cpu") paddle.enable_static() fleet.init() if args.num_nodes is None: num_nodes = load(args.dataset).num_nodes else: num_nodes = args.num_nodes loss = StaticSkipGramModel(num_nodes, args.neg_num, args.embed_size, sparse=True) optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True) dist_strategy = fleet.DistributedStrategy() dist_strategy.a_sync = True optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) optimizer.minimize(loss) # init and run server or worker if fleet.is_server(): fleet.init_server() fleet.run_server() if fleet.is_worker(): place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(paddle.static.default_startup_program()) fleet.init_worker() graph = load(args.dataset) # bind gen train_ds = ShardedDataset(graph.nodes) collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size, args.neg_num, args.neg_sample_type) data_loader = Dataloader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.sample_workers, collate_fn=collate_fn) cpu_num = int(os.environ.get('CPU_NUM', 1)) if int(cpu_num) > 1: parallel_places = [paddle.CPUPlace()] * cpu_num exec_strategy = paddle.static.ExecutionStrategy() exec_strategy.num_threads = int(cpu_num) build_strategy = paddle.static.BuildStrategy() build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce compiled_prog = paddle.static.CompiledProgram( paddle.static.default_main_program()).with_data_parallel( loss_name=loss.name, places=parallel_places, build_strategy=build_strategy, exec_strategy=exec_strategy) else: compiled_prog = paddle.static.default_main_program() for epoch in range(args.epoch): train_loss = train(exe, compiled_prog, data_loader, loss) log.info("Runing epoch:%s\t train_loss:%.6f", epoch, train_loss) fleet.stop_worker() if fleet.is_first_worker(): fleet.save_persistables(exe, "./model", paddle.static.default_main_program())