def build_role(self):
    environs = {}
    environs[
        "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36012,127.0.0.1:36013"
    environs[
        "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36014,127.0.0.1:36015"
    environs[
        "PADDLE_ALL_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36016,127.0.0.1:36017"
    environs[
        "PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:36014,127.0.0.1:36015"
    environs["PADDLE_HETER_TRAINER_DEVICE"] = "gpu"
    environs["TRAINING_ROLE"] = "HETER_TRAINER"
    environs["STAGE_ID"] = 2
    environs["STAGE_NUM"] = 2
    environs["HETER_DEVICE_TYPE"] = "gpu"
    environs["PADDLE_STAGE_TRAINERS_NUM"] = [2, 2]
    environs["PADDLE_TRAINERS_NUM"] = 2
    environs["PADDLE_TRAINER_ID"] = 0
    environs["POD_IP"] = "127.0.0.1"
    environs["PADDLE_PORT"] = "36016"
    environs["FLAGS_selected_gpus"] = 0

    for k, v in environs.items():
        os.environ[k] = str(v)

    self.role = role_maker.PaddleCloudRoleMaker()
    return self.role
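# Reading of the setup above (descriptive note, not part of the test): this
# process acts as stage 2 of a two-stage heterogeneous run, binding
# 127.0.0.1:36016 as trainer 0 of the GPU stage, with the previous stage's
# trainers at 127.0.0.1:36014,127.0.0.1:36015.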
def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())
    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 0)
    self.assertEqual(sgds, 0)
def test(self):
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ[
        "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
    os.environ["TRAINING_ROLE"] = "PSERVER"

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    loss, acc, _ = self.net()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    configs = {}
    configs['__emb__'] = {
        "table_parameters.__emb__.accessor.embed_sgd_param.name":
        "SparseNaiveSGDRule",
        "table_parameters.__emb__.accessor.embedx_sgd_param.name":
        "SparseAdamSGDRule",
    }
    strategy.sparse_table_configs = configs

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss)
    fleet.init_server()
def test_fs_gloo6(self):
    plats = platform.platform()
    if 'Linux' not in plats:
        print("skip gloo UT on MacOS/Win")
        return

    tmp = self.mkdir()
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINERS_NUM"] = "0"
    os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
    os.environ["PADDLE_WITH_GLOO"] = "2"
    os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
    os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
    os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
    os.environ["PADDLE_GLOO_FS_PATH"] = tmp

    role = role_maker.PaddleCloudRoleMaker()
    role._generate_role()
    self.case(role, "server")
    self.case(role, "all")
    self.clean(tmp)
def test_a_sync_optimizer_pserver(self):
    os.environ["TRAINING_ROLE"] = "PSERVER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()
    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
    fleet.init_server()
def test_communicator_sync(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
        "127.0.0.1:36001,127.0.0.2:36001"

    fleet.init(role_maker.PaddleCloudRoleMaker())
    avg_cost = self.net()

    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()
def test_pipeline_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.without_graph_optimization = True

    optimizer = paddle.fluid.optimizer.Adam(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
def test_pipeline_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with paddle.fluid.device_guard("gpu:0"):
        input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')

    with paddle.fluid.device_guard("gpu:1"):
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.pipeline = True
    strategy.pipeline_configs = {
        'micro_batch_size': 1,
        'accumulate_steps': 2
    }

    optimizer = paddle.fluid.optimizer.Adam(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
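# Note: this pipeline program is split across "gpu:0" and "gpu:1", so the test
# is expected to run under a two-GPU launcher, e.g. (assumed command, not part
# of the test itself):
#   python -m paddle.distributed.launch --gpus "0,1" test_file.py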
def test_get_util(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    default_util = fleet.util
    self.assertEqual(default_util, None)
def test_a_sync_optimizer2(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()
    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.auto = True

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    self.assertTrue(optimizer.user_defined_strategy.a_sync)
    a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
    self.assertTrue(a_sync_configs['k_steps'] == 800)
def net(main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

            input_x = paddle.fluid.layers.data(
                name="x", shape=[32], dtype='float32')
            input_y = paddle.fluid.layers.data(
                name="y", shape=[1], dtype='int64')

            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
            prediction = paddle.fluid.layers.fc(input=[fc_2], size=2,
                                                act='softmax')
            cost = paddle.fluid.layers.cross_entropy(
                input=prediction, label=input_y)
            avg_cost = paddle.fluid.layers.mean(x=cost)

            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
            opt = MetaOptimizerBase(optimizer)
            opt_ops, params_grads = opt.minimize(avg_cost)
            opt.apply_optimize(avg_cost,
                               paddle.static.default_startup_program(),
                               params_grads)
    return None
def test_fp16_allreduce_optimizer(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    train_prog, startup_prog = fluid.Program(), fluid.Program()
    avg_cost, strategy = self.net(train_prog, startup_prog)

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    ops = [op.type for op in avg_cost.block.ops]
    cast_out = [
        op.output('Out')[0] for op in avg_cost.block.ops
        if op.type == 'cast'
    ]

    cast_op_count = 0
    for name in ops:
        if name == 'cast':
            cast_op_count += 1
    self.assertIn('cast', ops)
    self.assertEqual(cast_op_count, 12)  # 6 + 6, cast_fp16 + cast_fp32

    for name in cast_out:
        self.assertIn('cast_fp16', name)
def init_fleet_with_gloo(use_gloo=True):
    if use_gloo:
        os.environ["PADDLE_WITH_GLOO"] = "1"
        role = role_maker.PaddleCloudRoleMaker()
        fleet.init(role)
    else:
        fleet.init()
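# Minimal usage sketch for init_fleet_with_gloo (assumed single-node
# parameter-server environment; the variables mirror those set in the gloo
# tests elsewhere in this suite, and PaddleCloudRoleMaker reads them from the
# environment):
if __name__ == "__main__":
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    init_fleet_with_gloo(use_gloo=True)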
def instance(self, context):
    import paddle.distributed.fleet.base.role_maker as role_maker
    import paddle.distributed.fleet as fleet

    is_collective = (context["fleet_mode"] == 'COLLECTIVE')
    role = role_maker.PaddleCloudRoleMaker(is_collective=is_collective)
    fleet.init(role)

    context['fleet'] = fleet
    context['role'] = role
    context['status'] = 'network_pass'
def runtime_main():
    import paddle.distributed.fleet as fleet

    # model definition
    train_prog = paddle.fluid.Program()
    startup_prog = paddle.fluid.Program()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            input_x = paddle.fluid.layers.data(
                name="x", shape=[32], dtype='float32')
            input_y = paddle.fluid.layers.data(
                name="y", shape=[1], dtype='int64')

            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
            prediction = paddle.fluid.layers.fc(input=[fc_2], size=2,
                                                act='softmax')
            cost = paddle.fluid.layers.cross_entropy(
                input=prediction, label=input_y)
            avg_cost = paddle.fluid.layers.mean(x=cost)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.sharding = True
            strategy.sharding_configs = {
                "sharding_segment_strategy": "segment_broadcast_MB",
                "segment_broadcast_MB": 0.2,
                "sharding_degree": 2,
            }

            optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
                                                        momentum=0.9)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

    # execution
    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    dirname = "./ut_sharding_save_model"
    sharding.utils.save_persistables(exe, dirname,
                                     main_program=train_prog,
                                     filename=None)

    out_losses = []
    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))
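# The empty loss list pickled to stdout above is presumably how the launching
# test collects per-process results (six.PY2 guards the py2/py3 stdout API
# difference); this run only checks that the sharding save_persistables path
# completes.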
def __init__(self, config):
    self.metrics = {}
    self.config = config
    self.input_data = None
    self.reader = None
    self.exe = None
    self.train_result_dict = {}
    self.train_result_dict["speed"] = []
    self.model = None
    self.pure_bf16 = self.config['pure_bf16']
    self.role_maker = role_maker.PaddleCloudRoleMaker()
def test_fs_gloo8(self):
    plats = platform.platform()
    if 'Linux' not in plats:
        print("skip gloo UT on MacOS/Win")
        return

    tmp = self.mkdir()
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINERS_NUM"] = "0"
    os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
    os.environ["PADDLE_WITH_GLOO"] = "2"
    os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
    os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
    os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
    os.environ["PADDLE_GLOO_FS_PATH"] = tmp

    def net():
        x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
        y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = paddle.fluid.layers.square_error_cost(
            input=y_predict, label=y)
        avg_cost = paddle.fluid.layers.mean(cost)
        return avg_cost

    from paddle.distributed import fleet

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    avg_cost = net()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False

    optimizer = paddle.optimizer.SGD(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    comm_world = "server"
    fleet.util.barrier(comm_world)

    gather = fleet.util.all_gather(1, comm_world)
    self.assertEqual(gather[0], 1)

    all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
    self.assertEqual(1, all_reduce)
    self.clean(tmp)
def setUp(self):
    os.environ[
        "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
    os.environ["PADDLE_TRAINERS_NUM"] = str(2)
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PORT"] = "4001"
    os.environ["POD_IP"] = "127.0.0.1"

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    self.strategy = paddle.distributed.fleet.DistributedStrategy()
    self.strategy.a_sync = True
def test_training_role(self):
    """Test that an invalid training role raises ValueError."""
    os.environ["TRAINING_ROLE"] = "TEST"
    try:
        import netifaces
    except ImportError:
        print("warning: no netifaces, skip test_training_role")
        return

    ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
    self.assertRaises(ValueError, ro.generate_role)
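# For reference, the TRAINING_ROLE values accepted by PaddleCloudRoleMaker in
# the tests above are "TRAINER", "PSERVER", and "HETER_TRAINER"; the bogus
# "TEST" value is what triggers the expected ValueError here.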
def test_lamb_optimizer(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    avg_cost, strategy = self.net(train_prog, startup_prog)

    optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    ops = [op.type for op in avg_cost.block.ops]
    self.assertIn('lamb', ops)
def test_fp16_allreduce_not_apply_fp16_net(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    train_prog, startup_prog = fluid.Program(), fluid.Program()
    avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16')

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    ops = [op.type for op in avg_cost.block.ops]
    self.assertNotIn('cast', ops)
def test_fleet_amp_meta_optimizer_init(self):
    if not fluid.core.is_compiled_with_cuda():
        return

    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with paddle.static.program_guard(main_program, startup_program):
        input_x = paddle.static.data(name="x", shape=[None, 32],
                                     dtype='float32')
        input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
        cost = mlp(input_x, input_y)

        optimizer = paddle.optimizer.Momentum(
            learning_rate=0.001,
            momentum=0.9,
            weight_decay=fluid.regularizer.L2Decay(1e-4),
            multi_precision=True)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.amp = True
        strategy.amp_configs = {'use_pure_fp16': True}
        strategy.gradient_merge = True
        strategy.gradient_merge_configs = {"k_steps": 2}

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(cost)

    print(fleet._get_applied_meta_list())
    loss_scale = optimizer.get_loss_scaling()

    place = paddle.CUDAPlace(0)
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    optimizer.amp_init(place)

    step = 3
    for i in range(step):
        cost_val = exe.run(program=main_program,
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print(cost_val)
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe, paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

        fleet.stop_worker()
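# Hedged sketch of an argument parser matching the attributes main() reads
# from `args`; the flag names come from the code above, but every default
# below is illustrative only, not taken from the original script.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser(
        description="distributed skip-gram training")
    parser.add_argument("--dataset", type=str, required=True)
    parser.add_argument("--num_nodes", type=int, default=None)
    parser.add_argument("--neg_num", type=int, default=5)
    parser.add_argument("--embed_size", type=int, default=128)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--batch_size", type=int, default=512)
    parser.add_argument("--walk_len", type=int, default=40)
    parser.add_argument("--win_size", type=int, default=5)
    parser.add_argument("--neg_sample_type", type=str, default="average")
    parser.add_argument("--sample_workers", type=int, default=4)
    parser.add_argument("--epoch", type=int, default=10)
    return parser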
def test_dgc_optimizer_backward(self):
    """ test dgc optimizer backward """
    train_prog, startup_prog = fluid.Program(), fluid.Program()
    avg_cost, strategy = self.net(train_prog, startup_prog)
    self.set_strategy(strategy, 'dgc')

    opt = fluid.optimizer.MomentumOptimizer(
        learning_rate=0.001, momentum=0.9)
    dgc_opt = DGCOptimizer(opt)
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
    params_grads = dgc_opt.backward(avg_cost, startup_prog)

    ops = [op.type for op in avg_cost.block.ops]
    self.assertNotIn('dgc', ops)
def test_hdfs_gloo_v2(self):
    plats = platform.platform()
    if 'Linux' not in plats:
        print("skip gloo UT on MacOS/Win")
        return

    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
    os.environ["PADDLE_WITH_GLOO"] = "1"
    os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
    os.environ["PADDLE_GLOO_FS_NAME"] = ""
    os.environ["PADDLE_GLOO_FS_UGI"] = ""
    os.environ["PADDLE_GLOO_FS_PATH"] = ""

    role = role_maker.PaddleCloudRoleMaker()
    self.assertRaises(ValueError, role._generate_role)
def test_pipeline_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with paddle.fluid.device_guard("gpu:0"):
        input_x = paddle.fluid.layers.data(name="x", shape=[32],
                                           dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        fc_3 = paddle.fluid.layers.fc(input=fc_2, size=64, act='tanh')
        fc_4 = paddle.fluid.layers.fc(input=fc_3, size=64, act='tanh')
        fc_5 = paddle.fluid.layers.fc(input=fc_4, size=64, act='tanh')
        fc_6 = paddle.fluid.layers.fc(input=fc_5, size=64, act='tanh')

    with paddle.fluid.device_guard("gpu:1"):
        fc_7 = paddle.fluid.layers.fc(input=fc_6, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_7], size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.pipeline = True
    strategy.pipeline_configs = {
        'micro_batch_size': 1,
        'accumulate_steps': 2,
        'schedule_mode': '1F1B'
    }

    checkpoints = ['fc_5.tmp_0', 'fc_7.tmp_0']
    strategy.recompute = True
    strategy.recompute_configs = {
        "checkpoints": checkpoints,
        "enable_offload": False,
        "checkpoint_shape": []
    }

    optimizer = paddle.fluid.optimizer.Adam(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
def test_a_sync_optimizer_trainer(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()
    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 7)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
def test_rnn_raw_optimizer(self):
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    train_program = static.Program()
    start_program = static.Program()
    train_program, start_program, loss, optimizer, data_holders = \
        rnn_pretrain_forward(train_program, start_program)

    with paddle.static.program_guard(
            train_program, start_program), paddle.utils.unique_name.guard():
        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        strategy.fuse_all_reduce_ops = True
        fleet.init(is_collective=True, strategy=strategy)
        optimizer = fleet.distributed_optimizer(optimizer)
        optimizer.minimize(loss)
def test_set_user_defined_util(self):
    import paddle.distributed.fleet as fleet

    class UserDefinedUtil(fleet.UtilBase):
        def __init__(self):
            super(UserDefinedUtil, self).__init__()

        def get_user_id(self):
            return 10

    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    my_util = UserDefinedUtil()
    fleet.util = my_util
    user_id = fleet.util.get_user_id()
    self.assertEqual(user_id, 10)
def test_amp_optimizer_backward(self):
    """ test amp optimizer backward """
    train_prog, startup_prog = fluid.Program(), fluid.Program()
    avg_cost, strategy = self.net(train_prog, startup_prog)

    opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
                                            momentum=0.9)
    opt = AMPOptimizer(opt)
    self.set_strategy(strategy, 'amp')

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    opt._set_basic_info(avg_cost, role, opt, strategy)
    params_grads = opt.backward(avg_cost, startup_prog)

    ops = [op.type for op in avg_cost.block.ops]
    self.assertIn('cast', ops)
    self.assertNotIn('check_finite_and_unscale', ops)