def test_mlp_pp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "pp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    global PP_MESH_0
    PP_MESH_0 = auto.ProcessMesh(mesh=[0])
    global PP_MESH_1
    PP_MESH_1 = auto.ProcessMesh(mesh=[1])
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = DistributedContext()
    rank_id = 1
    dist_main_prog, dist_startup_prog = get_dist_prog(
        train_program, startup_program, dist_context, rank_id)
    for key in list(_g_process_group_map.keys()):
        del _g_process_group_map[key]
    reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)
    # print_program_with_dist_attr(dist_main_prog, dist_context)

    # check send and recv result
    self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))

    # parameter initialization of every rank should be different in the pipeline scene
    self.assertTrue(check_initialization(dist_startup_prog, rank_id))
def get_model(self, place, gradient_merge, batch_size, max_step):
    paddle.seed(2021)
    random.seed(2021)
    np.random.seed(2021)

    hidden_size = 128

    global _global_parallel_strategy
    global _global_process_mesh

    world_size = paddle.distributed.get_world_size()
    if world_size == 1:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0])
    elif world_size == 2:
        _global_parallel_strategy = "dp"
        _global_process_mesh = auto.ProcessMesh([0, 1])

    train_program = static.Program()
    startup_program = static.Program()
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    #if gradient_merge:
    #    dist_strategy.gradient_merge = True
    #    dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
    fleet.init(is_collective=True, strategy=dist_strategy)

    with static.program_guard(train_program, startup_program), \
            utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        input.stop_gradient = False
        loss = mlp_forward(input, label, hidden_size)

    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    input_data = np.random.random(size=(128, hidden_size)).astype('float32')
    label_data = np.random.random(size=(128, 1)).astype('float32')

    def reader():
        for i in range(max_step):
            x_data = input_data[i * batch_size:(i + 1) * batch_size, :]
            y_data = label_data[i * batch_size:(i + 1) * batch_size, :]
            yield x_data, y_data

    return dist_main_prog, dist_startup_prog, [input, label], [loss], reader
def test_input_invalid(self):
    set_default_distributed_context(None)
    global _global_parallel_strategy
    _global_parallel_strategy = "mp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh([0, 1])
    dist_main_prog, _, _ = get_distributed_program()
    with self.assertRaises(TypeError):
        save_distributed_checkpoint(dist_main_prog, [""], [""],
                                    addition_info=[0])
    with self.assertRaises(ValueError):
        save_distributed_checkpoint(dist_main_prog, [""], [""],
                                    addition_info={"step": 0})
    with self.assertRaises(ValueError):
        save_distributed_checkpoint(dist_main_prog, [""], [""],
                                    addition_info={"batch": 0.0})
    with self.assertRaises(ValueError):
        load_checkpoint_into_program(["./model_state_rank.pdmodel"],
                                     ["./dist_attr_rank.pdattr"],
                                     dist_main_prog)
    with self.assertRaises(ValueError):
        load_distributed_checkpoint(["./model_state_rank.pdmodel"],
                                    ["./dist_attr_rank.pdattr"])
    with self.assertRaises(TypeError):
        load_distributed_checkpoint({"0": "./model_state_rank.pdmodel"},
                                    {"1": "./dist_attr_rank.pdattr"})
def train():
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program, loader = mlp_pretrain_forward(
        train_program, start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    places = static.cuda_places()
    loader.set_batch_generator(batch_generator_creator(), places=places)
    exe = paddle.static.Executor(places[0])
    exe.run(distributed_startup_program)

    for data in loader():
        exe.run(distributed_main_program, feed=data, fetch_list=[loss])
def test_new_local_tensor(self):
    test_auto_parallel_reshard._global_process_mesh = auto.ProcessMesh(
        mesh=[0, 1])
    test_auto_parallel_reshard._global_parallel_strategy = "dp"
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = DistributedContext()
    rank_id = 0
    dist_main_prog, dist_startup_prog, complete_train_program = get_dist_prog(
        train_program, startup_program, dist_context, rank_id)
    dist_context.dist_main_programs[rank_id] = dist_main_prog
    dist_context.dist_startup_programs[rank_id] = dist_startup_prog
    name = "layer_norm_1.tmp_2"
    dist_tensor = dist_context.get_dist_tensor_for_program(
        complete_train_program.global_block().vars[name])
    dist_tensor._dist_context = dist_context
    intermediate_var_0 = dist_tensor.new_local_tensor(
        name="intermediate_var_0")
    self.assertEqual(intermediate_var_0.shape, (2, 1024))
    self.assertEqual(intermediate_var_0.name, "intermediate_var_0")

    rank_id = 1
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = DistributedContext()
    dist_main_prog, dist_startup_prog, complete_train_program = get_dist_prog(
        train_program, startup_program, dist_context, rank_id, None)
    dist_context.dist_main_programs[rank_id] = dist_main_prog
    dist_context.dist_startup_programs[rank_id] = dist_startup_prog
    name = "layer_norm_1.tmp_2"
    dist_tensor = dist_context.get_dist_tensor_for_program(
        complete_train_program.global_block().vars[name])
    dist_tensor._dist_context = dist_context
    intermediate_var_1 = dist_tensor.new_local_tensor(
        rank=rank_id, name="intermediate_var_1")
    self.assertEqual(intermediate_var_0.shape, (2, 1024))
    self.assertEqual(intermediate_var_1.name, "intermediate_var_1")

    name = "linear_0.w_0"
    dist_tensor = dist_context.get_dist_tensor_for_program(
        complete_train_program.global_block().vars[name])
    dist_tensor._dist_context = dist_context
    intermediate_var_1 = dist_tensor.new_local_tensor(
        rank=rank_id, name="linear_0.w_0_intermediate")
    self.assertEqual(intermediate_var_1.shape, (1024, 4096))
    self.assertEqual(intermediate_var_1.name, "linear_0.w_0_intermediate")

    copied_dist_context = copy.deepcopy(dist_context)
    self.assertIsNotNone(copied_dist_context)
    self.assertEqual(
        id(copied_dist_context),
        id(
            copied_dist_context.get_dist_tensor_for_program(
                dist_tensor.serial_tensor).dist_context))
def test_update(self):
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    _, train_program, startup_program = mlp_forward(train_program,
                                                    startup_program)
    global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    dist_context = DistributedContext()
    set_default_dist_attr(train_program, dist_context, global_process_mesh)
    ops = train_program.global_block().ops
    vars = train_program.global_block().vars
    from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
    from paddle.distributed.auto_parallel.operators.common import is_elementwise_op
    from paddle.distributed.auto_parallel.dist_op import DistributedOperator

    for op in ops:
        dist_op_impl_container = get_distributed_operator_impl_container(
            op.type)
        if dist_op_impl_container is None:
            op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
            dist_op = DistributedOperator(op, op_dist_attr)
            if is_elementwise_op(op.type):
                changed = update_op_dims_mapping_by_elementwise_like_dist_impl(
                    dist_op)
                self.assertFalse(changed)

                dist_op.dist_attr.set_output_dims_mapping(
                    op.output_arg_names[0], [0] + [
                        -1 for i in range(
                            1, len(vars[op.output_arg_names[0]].shape))
                    ])
                try:
                    changed = update_op_dims_mapping_by_elementwise_like_dist_impl(
                        dist_op)
                except:
                    continue
                self.assertTrue(changed)
            else:
                changed = update_op_dims_mapping_by_default_dist_impl(
                    dist_op)
                self.assertFalse(changed)

                dist_op.dist_attr.set_output_dims_mapping(
                    op.output_arg_names[0], [0] + [
                        -1 for i in range(
                            1, len(vars[op.output_arg_names[0]].shape))
                    ])
                try:
                    changed = update_op_dims_mapping_by_default_dist_impl(
                        dist_op)
                except:
                    continue
                self.assertTrue(changed)
def make_program_serial():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0]),
                              "dims_mapping": [-1, -1, -1]
                          })
        tmp_0 = paddle.norm(x, p=2)
    return main_program, start_program, tmp_0
def test_allgather(self):
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    process_mesh = auto.ProcessMesh(mesh=[0, 3])
    with static.program_guard(train_program, startup_program):
        x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
        x = auto.shard_tensor(x,
                              dist_attr={
                                  "process_mesh": process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
        w = auto.shard_tensor(w,
                              dist_attr={
                                  "process_mesh": process_mesh,
                                  "dims_mapping": [-1, -1]
                              })

        # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
        #     x.name: [-1, -1],
        #     w.name: [-1, -1]
        # }, **{"x": x,
        #       "y": w})[0]

        y = paddle.distributed.shard_op(paddle.matmul,
                                        dist_attr={
                                            "process_mesh": process_mesh,
                                            x: {
                                                "dims_mapping": [-1, -1]
                                            },
                                            w: {
                                                "dims_mapping": [-1, -1]
                                            }
                                        })(x, w)[0]

    rank_id = 0
    dist_context = DistributedContext()
    dist_strategy = fleet.DistributedStrategy()
    partitioner = Partitioner(dist_context, rank_id)
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    dist_context.block_state.parse_forward_blocks(complete_train_program)
    partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
        complete_train_program, startup_program, [])
    resharder = Resharder(partitioned_main_prog, partitioned_startup_prog,
                          rank_id, dist_context, partitioned_params_grads)
    resharder.reshard()

    # x should not be sliced
    self.assertTrue(check_allgather(partitioned_main_prog))
def test_decoder_dp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "dp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
    train_program = static.Program()
    start_program = static.Program()
    dist_context = DistributedContext()
    train_program, start_program = decoder_pretrain_forward(train_program,
                                                            start_program)
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    self.assertTrue(dist_context.validate_dist_attr_for_program())
def test_mlp_serial(self):
    global _global_parallel_strategy
    _global_parallel_strategy = None
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0])

    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = get_default_distributed_context()
    rank_id = 0
    dist_main_prog, dist_startup_prog = get_dist_prog_with_parallelizer(
        train_program, startup_program, dist_context)

    # send and recv should not exist in the serial scene
    self.assertFalse(check_send_recv_result(dist_main_prog, rank_id))
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0, 1]),
                              "dims_mapping": [0, -1, -1]
                          })
        tmp_0 = x[0]
        tmp_1 = x[:, 0, :]
        tmp_2 = x[:, :, 1]
        tmp_3 = x[:2, :2, :2]
    return main_program, start_program
def test_attn_dp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "dp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
    train_program = static.Program()
    start_program = static.Program()
    dist_context = DistributedContext()
    train_program, start_program = attn_pretrain_forward(train_program,
                                                         start_program)
    complete_train_program = auto.complete_annotation(train_program,
                                                      dist_context)
    # print_program_with_dist_attr(complete_train_program,
    #                              dist_context)
    self.assertTrue(dist_context.validate_dist_attr_for_program())
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0, 1]),
                              "dims_mapping": [0, -1, -1]
                          })
        tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2])
        tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8])
        tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1))
    return main_program, start_program
def test_mlp_mp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "mp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh([0, 1])

    dist_main_prog, dist_start_prog, loss = get_distributed_program()
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)
    exe.run(dist_start_prog)

    input = np.random.random(size=(80, 64)).astype('float32')
    label = np.random.random(size=(80, 1)).astype('float32')
    for step in range(20):
        if step == 10:
            path = "./output_mp{}".format(paddle.distributed.get_rank())
            os.makedirs(path, exist_ok=True)
            save_distributed_checkpoint(dist_main_prog, path, path)

        res = exe.run(dist_main_prog,
                      feed={
                          "input": input[step * 4:(step + 1) * 4, :],
                          "label": label[step * 4:(step + 1) * 4, :]
                      },
                      fetch_list=[loss])

    last_res = res[0]
    ckpt_path = [
        "./output_mp0/model_state_rank0.pdmodel",
        "./output_mp1/model_state_rank1.pdmodel"
    ]
    dist_attr_path = [
        "./output_mp0/dist_attr_rank0.pdattr",
        "./output_mp1/dist_attr_rank1.pdattr"
    ]
    load_checkpoint_into_program(ckpt_path, dist_attr_path, dist_main_prog)
    for step in range(10, 20):
        res = exe.run(dist_main_prog,
                      feed={
                          "input": input[step * 4:(step + 1) * 4, :],
                          "label": label[step * 4:(step + 1) * 4, :]
                      },
                      fetch_list=[loss])

    self.assertEqual(last_res, res[0])
    shutil.rmtree("./output_mp{}".format(paddle.distributed.get_rank()))
def test_instance_method(self):
    tensor_dist_attr = TensorDistributedAttribute()
    tensor_dist_attr.dims_mapping = [1, 0]
    tensor_dist_attr.process_mesh = auto.ProcessMesh(
        mesh=[[0, 1, 2], [3, 4, 5]])
    serial_tensor = paddle.static.data(name="data",
                                       shape=[6, 6],
                                       dtype='float32')
    dist_tensor = DistributedTensor(serial_tensor, tensor_dist_attr)

    # rank 0 [(0, 2), (0, 3)]
    # rank 1 [(2, 4), (0, 3)]
    # rank 4 [(2, 4), (3, 6)]
    rank = 0
    local_sizes = dist_tensor.local_sizes(rank)
    self.assertEqual(local_sizes, [2, 3])
    local_offsets = dist_tensor.local_offsets(rank)
    self.assertEqual(local_offsets, [0, 0])
    local_shard = dist_tensor.local_shard(rank)
    self.assertEqual(local_shard, [(0, 2), (0, 3)])
    self.assertEqual(local_sizes, dist_tensor.local_sizes(rank))
    self.assertEqual(local_offsets, dist_tensor.local_offsets(rank))
    self.assertEqual(local_shard, dist_tensor.local_shard(rank))
    self.assertEqual(local_sizes, dist_tensor.local_sizes())
    self.assertEqual(local_offsets, dist_tensor.local_offsets())
    self.assertEqual(local_shard, dist_tensor.local_shard())

    rank = 1
    local_sizes = dist_tensor.local_sizes(rank)
    self.assertEqual(local_sizes, [2, 3])
    local_offsets = dist_tensor.local_offsets(rank)
    self.assertEqual(local_offsets, [2, 0])
    local_shard = dist_tensor.local_shard(rank)
    self.assertEqual(local_shard, [(2, 4), (0, 3)])

    rank = 4
    local_sizes = dist_tensor.local_sizes(rank)
    self.assertEqual(local_sizes, [2, 3])
    local_offsets = dist_tensor.local_offsets(rank)
    self.assertEqual(local_offsets, [2, 3])
    local_shard = dist_tensor.local_shard(rank)
    self.assertEqual(local_shard, [(2, 4), (3, 6)])

    global_sizes = dist_tensor.global_sizes()
    self.assertEqual(global_sizes, (6, 6))
def test_mlp_dp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "dp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = DistributedContext()
    rank_id = 0
    dist_main_prog, dist_startup_prog = get_dist_prog(
        train_program, startup_program, dist_context, rank_id)
    reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)

    # send and recv should not exist in the dp scene
    self.assertFalse(check_send_recv_result(dist_main_prog, rank_id))

    # all parameters should be initialized in the dp scene
    self.assertTrue(check_initialization_for_dp(dist_startup_prog))
def create_model(train_program, start_program):
    with paddle.static.program_guard(train_program, start_program):
        MESH_0 = auto.ProcessMesh([0, 1])
        input = paddle.static.data(name='input', shape=[8, 8])
        label = paddle.static.data(name='label', shape=[8, 8])

        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(mean=0.0, std=0.02))
        linear0 = nn.Linear(8, 8, weight_attr)
        linear1 = nn.Linear(8, 8, weight_attr)

        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": MESH_0,
                              "dims_mapping": [-1, -1]
                          })
        auto.shard_tensor(label,
                          dist_attr={
                              "process_mesh": MESH_0,
                              "dims_mapping": [-1, -1]
                          })
        auto.shard_tensor(linear0.weight,
                          dist_attr={
                              "process_mesh": MESH_0,
                              "dims_mapping": [-1, 0]
                          })
        auto.shard_tensor(linear1.weight,
                          dist_attr={
                              "process_mesh": MESH_0,
                              "dims_mapping": [0, -1]
                          })

        linear0_out = linear0(input)
        gelu_out = F.gelu(linear0_out)
        linear1_out = linear1(gelu_out)
        error_cost = paddle.nn.functional.square_error_cost(linear1_out,
                                                            label)
        loss = paddle.mean(error_cost)
    return train_program, start_program, loss, input, label
def test_complete_backward_annotation(self):
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    dist_context = DistributedContext()
    rank_id = 0
    dist_main_prog, dist_startup_prog = get_dist_prog(
        train_program, startup_program, dist_context, 0)

    op_need_check = None
    for op in dist_main_prog.global_block().ops:
        if op.type == "gelu_grad":
            op_need_check = op
            break
    # print_program_with_dist_attr(dist_main_prog, dist_context)

    # grad op should have dist attr
    self.assertTrue(
        check_backward_dist_attr(dist_context, dist_main_prog,
                                 op_need_check))
def init_prog(self):
    # block = self.main_program.global_block()
    self.w = self.layer_help.create_parameter(
        dtype="float", shape=[20], attr=None)
    self.w_grad = paddle.static.data(
        name='w_grad', shape=[20], dtype='float')
    self.tmp1 = paddle.static.data(name='tmp1', shape=[20], dtype='float')
    self.tmp2 = paddle.static.data(name='tmp2', shape=[20], dtype='float')
    self.batch_reduced = paddle.static.data(
        name='batch_reduced', shape=[1], dtype='float')
    self.attrs = {}

    default_dist_context = get_default_distributed_context()
    _global_process_mesh = auto.ProcessMesh(list(range(nranks)))
    tensor_dist_attr = set_var_dist_attr(
        default_dist_context,
        self.tmp1, [-1],
        _global_process_mesh,
        mark_annotated=True)
    tensor_dist_attr = set_var_dist_attr(
        default_dist_context,
        self.tmp1, [-1],
        _global_process_mesh,
        mark_annotated=True)

    op = self.layer_help.append_op(
        type="add_p",
        inputs={'X': self.tmp1,
                'Y': self.w},
        outputs={'Z': self.w_grad},
        attrs=self.attrs)

    op = self.layer_help.append_op(
        type="reduce_p",
        inputs={'X': self.tmp2},
        outputs={'Y': self.batch_reduced},
        attrs={"axis": [0]})
def test_mlp_serial(self):
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.amp = False
    dist_strategy.pipeline = False
    dist_strategy.recompute = False
    # init parallel optimizer
    dist_strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    loss, train_program, start_program = mlp_pretrain_forward(train_program,
                                                              start_program)

    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=None)
    optimizer = fleet.distributed_optimizer(optimizer)
    _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
        loss, start_program)

    suffix = core.kAutoParallelSuffix()
    for block in distributed_main_program.blocks:
        for op in block.ops:
            for attr_name in op.attr_names:
                self.assertTrue(suffix not in attr_name)

    # print_program_with_dist_attr(distributed_main_program)
    self.assertIsNotNone(distributed_startup_program)
    self.assertIsNotNone(distributed_main_program)
def get_gpt_model(self, strategy, place, batch_size, sequence_len,
                  vocab_size):
    modeling.init_global()
    if strategy == "dp":
        modeling._global_parallel_strategy = "dp"
        modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    elif strategy == "mp":
        modeling._global_parallel_strategy = "mp"
        modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    else:
        raise ValueError("'get_gpt_model' only supports dp and mp.")

    tokens = paddle.static.data(name="tokens",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
    position_ids = paddle.static.data(name="position_ids",
                                      shape=[batch_size, sequence_len],
                                      dtype='int64')
    attention_mask = paddle.static.data(
        name="attention_mask",
        shape=[batch_size, 1, sequence_len, sequence_len],
        dtype='float32')
    labels = paddle.static.data(name="labels",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
    loss_mask = paddle.static.data(name="loss_mask",
                                   shape=[batch_size, sequence_len],
                                   dtype='float32')
    data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]

    if modeling._global_parallel_strategy == "dp":
        auto.shard_tensor(tokens,
                          dist_attr={
                              "process_mesh": modeling._global_process_mesh,
                              "dims_mapping": [0, -1]
                          })
    elif modeling._global_parallel_strategy == "pp":
        auto.shard_tensor(tokens,
                          dist_attr={
                              "process_mesh": modeling.PP_MESH_LIST[0],
                              "dims_mapping": [-1, -1]
                          })
        auto.shard_tensor(attention_mask,
                          dist_attr={
                              "process_mesh": modeling.PP_MESH_LIST[0],
                              "dims_mapping": [-1, -1, -1, -1]
                          })

    gpt = GPTModel(vocab_size=1000,
                   hidden_size=64,
                   num_hidden_layers=2,
                   num_attention_heads=8,
                   intermediate_size=256,
                   hidden_act="gelu",
                   hidden_dropout_prob=0.0,
                   attention_probs_dropout_prob=0.0,
                   max_position_embeddings=1024,
                   type_vocab_size=1,
                   initializer_range=0.02,
                   pad_token_id=0,
                   eos_token_id=7,
                   bos_token_id=0,
                   eol_token_id=3)

    model = GPTForPretraining(gpt,
                              vocab_size=1000,
                              hidden_size=64,
                              initializer_range=0.02)
    preds = model(tokens, position_ids, attention_mask)
    criterion = GPTPretrainingCriterion()
    loss = criterion(preds, labels, loss_mask)
    clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=clip)
    optimizer = fleet.distributed_optimizer(optimizer)
    startup_program = paddle.static.default_startup_program()
    _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    def gen_data():
        np.random.seed(2021)
        for _ in range(10):
            tokens = []
            position_ids = []
            attention_mask = []
            labels = []
            loss_mask = []
            for _ in range(batch_size):
                tokens.append(
                    np.random.randint(vocab_size, size=sequence_len))
                position_ids.append(np.arange(sequence_len))
                attention_mask.append([np.tril(np.ones(sequence_len))])
                labels.append(
                    np.random.randint(vocab_size, size=sequence_len))
                loss_mask.append(np.ones(sequence_len))
            yield tokens, position_ids, attention_mask, labels, loss_mask

    return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data
def enum_valid_dist_attr_for_program(program,
                                     process_mesh_topology,
                                     is_pipeline=False):
    """Enumerate valid distributed attributes for all ops in program."""
    valid_dist_attr_dict = OrderedDict()
    ops = program.global_block().ops
    vars = program.global_block().vars

    processes = reduce(lambda x, y: x * y, process_mesh_topology)
    global_group = [i for i in range(processes)]
    global_process_mesh = None
    pipeline_process_meshes = None

    # in the pipeline mode, there are some process meshes
    if is_pipeline:
        pipeline_stages = process_mesh_topology[-1]
        op_count_per_stage = len(ops) // pipeline_stages
        if len(process_mesh_topology) > 1:
            process_mesh_shape = process_mesh_topology[:-1]
            per_process_mesh_group = processes // pipeline_stages
            pipeline_process_meshes = [auto.ProcessMesh(mesh=np.array(global_group[i*per_process_mesh_group: \
                (i+1)*per_process_mesh_group]).reshape(process_mesh_shape).tolist()) for i in range(pipeline_stages)]
        elif len(process_mesh_topology) == 1:
            pipeline_process_meshes = [
                auto.ProcessMesh(mesh=[i]) for i in range(pipeline_stages)
            ]
    else:
        if len(process_mesh_topology) > 1:
            global_process_mesh = auto.ProcessMesh(mesh=np.array(
                global_group).reshape(process_mesh_topology).tolist())
        else:
            global_process_mesh = auto.ProcessMesh(mesh=global_group)

    # enumerate valid distributed attribute for each op in the program
    for idx, op in enumerate(ops):
        op_valid_dist_attrs = None
        op_process_mesh = global_process_mesh
        pipeline_stage = -1
        if pipeline_process_meshes is not None:
            pipeline_stage = idx // op_count_per_stage if idx // op_count_per_stage < len(
                pipeline_process_meshes) else idx // op_count_per_stage - 1
            if pipeline_stage >= len(pipeline_process_meshes):
                pipeline_stage = len(pipeline_process_meshes) - 1
            op_process_mesh = pipeline_process_meshes[pipeline_stage]

        if op.type in PlanSpace.not_enum_ops:
            op_dist_attr = OperatorDistributedAttribute()
            op_dist_attr.process_mesh = op_process_mesh
            for var_name in op.input_arg_names:
                if var_name in PlanSpace.special_vars:
                    op_dist_attr.set_input_dims_mapping(var_name, [])
                else:
                    dims_mapping = [-1 for i in vars[var_name].shape]
                    op_dist_attr.set_input_dims_mapping(var_name,
                                                        dims_mapping)

            for var_name in op.output_arg_names:
                if var_name in PlanSpace.special_vars:
                    op_dist_attr.set_output_dims_mapping(var_name, [])
                else:
                    dims_mapping = [-1 for i in vars[var_name].shape]
                    op_dist_attr.set_output_dims_mapping(var_name,
                                                         dims_mapping)
            op_valid_dist_attrs = [op_dist_attr]
            pipeline_stage = 0 if pipeline_stage != -1 else pipeline_stage
        else:
            op_valid_dist_attrs = PlanSpace._enum_valid_dist_attr_for_op(
                program, op, op_process_mesh)

        assert op_valid_dist_attrs is not None, "Enumerate {} valid distributed attribute failed.".format(
            op)
        valid_dist_attr_dict[op.desc.id()] = [
            op_valid_dist_attrs, pipeline_stage
        ]

    return valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
from paddle.distributed.auto_parallel.reshard import Resharder
from paddle.distributed.auto_parallel.cost_model import estimate_cost
import paddle.fluid.core as core
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr

paddle.enable_static()
_global_parallel_strategy = "dp_mp_pp"
PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]])
PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]])
NUM_RANKS = 8
STAGE_0_CNT = 5
STAGE_1_CNT = 10
pp_cfg = [[0, 1, 4, 5], [2, 3, 6, 7]]

device = "gpu" if core.is_compiled_with_cuda() else "cpu"


class MLPLayer(nn.Layer):
    def __init__(self,
                 hidden_size=256,
                 intermediate_size=4 * 256,
                 initializer_range=0.02,
                 is_distributed=True):
import subprocess
import paddle
import paddle.nn as nn
import paddle.fluid as fluid
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
from paddle.fluid import layers
from paddle.io import Dataset, IterableDataset, DataLoader
from paddle.static import InputSpec
from paddle.distributed import fleet
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.engine import Engine

paddle.enable_static()
global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
PP_MESH_0 = auto.ProcessMesh([0])
PP_MESH_1 = auto.ProcessMesh([1])
batch_size = 1
batch_num = 10
hidden_size = 1024
sequence_len = 512
image_size = hidden_size
class_num = 10

paddle.seed(44)


class MyDataset(Dataset):
    def __init__(self, num_samples):
        super(MyDataset, self).__init__()
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.utils import make_data_unshard
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context
from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr

paddle.enable_static()

batch_size = 4
epoch_num = 10
hidden_size = 1024
sequence_len = 512
_g_process_mesh = auto.ProcessMesh([0, 1])


def get_random_inputs_and_labels(input_shape, label_shape):
    input = np.random.random(size=input_shape).astype('float32')
    label = np.random.random(size=label_shape).astype('float32')
    return input, label


def batch_generator_creator():
    def __reader__():
        for _ in range(batch_size):
            batch_input, batch_label = get_random_inputs_and_labels(
                [batch_size, sequence_len, hidden_size],
                [batch_size, sequence_len, 1])
            yield batch_input, batch_label
def test_mlp_pp2mp(self):
    set_default_distributed_context(None)
    global _global_parallel_strategy
    _global_parallel_strategy = "pp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh([0, 1])
    global PP_MESH_0
    PP_MESH_0 = auto.ProcessMesh(mesh=[0])
    global PP_MESH_1
    PP_MESH_1 = auto.ProcessMesh(mesh=[1])
    input = np.random.random(size=(80, 64)).astype('float32')
    label = np.random.random(size=(80, 1)).astype('float32')

    dist_main_prog, dist_start_prog, loss = get_distributed_program()
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)
    exe.run(dist_start_prog)
    for step in range(20):
        if step == 10:
            add_info = {"batch": step, "batch_size": 4}
            save_distributed_checkpoint(dist_main_prog, ".", ".", add_info)

        if paddle.distributed.get_rank() in [0]:
            res = exe.run(dist_main_prog,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          })
        else:
            res = exe.run(dist_main_prog,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss])
    if paddle.distributed.get_rank() in [1]:
        last_res = res[0]

    set_default_distributed_context(None)
    _global_parallel_strategy = "mp"
    _global_process_mesh = auto.ProcessMesh([0, 1])

    dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
    )
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)
    exe.run(dist_start_prog_load)

    ckpt_path = [
        "./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
    ]
    dist_attr_path = [
        "./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
    ]
    param_dict, pre_dist_attr, add_info = load_distributed_checkpoint(
        ckpt_path, dist_attr_path)
    batch = add_info["batch"]
    batch_size = add_info["batch_size"]
    start_index = batch * batch_size
    input = input[start_index:, :]
    label = label[start_index:, :]
    cur_dist_attr = get_dist_attr(dist_main_prog_load)
    sliced_param_dict = merge_and_slice_parameter(param_dict, pre_dist_attr,
                                                  cur_dist_attr)
    load_parameter_into_program(sliced_param_dict, dist_main_prog_load)
    for step in range(10):
        res = exe.run(dist_main_prog_load,
                      feed={
                          "input": input[step * 4:(step + 1) * 4, :],
                          "label": label[step * 4:(step + 1) * 4, :]
                      },
                      fetch_list=[loss_load])
    if paddle.distributed.get_rank() in [1]:
        self.assertEqual(last_res, res[0])
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.reshard import Resharder
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr

paddle.enable_static()
_global_parallel_strategy = "dp_mp_pp"
_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]])
PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]])


class MLPLayer(nn.Layer):
    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        d_model = hidden_size
        dim_feedforward = intermediate_size
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(mean=0.0,
                                              std=initializer_range))
        bias_attr = None
import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from paddle.distributed import fleet
from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.reshard import reshard
from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr

paddle.enable_static()
_global_parallel_strategy = "mp_pp"
_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]])
PP_MESH_0 = auto.ProcessMesh([0, 1])
PP_MESH_1 = auto.ProcessMesh([2, 3])


class MLPLayer(nn.Layer):
    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        d_model = hidden_size
        dim_feedforward = intermediate_size
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(mean=0.0,
                                              std=initializer_range))
        bias_attr = None
def test_mlp_mp2pp(self):
    set_default_distributed_context(None)
    global _global_parallel_strategy
    _global_parallel_strategy = "mp"
    global _global_process_mesh
    _global_process_mesh = auto.ProcessMesh([0, 1])

    input = np.random.random(size=(80, 64)).astype('float32')
    label = np.random.random(size=(80, 1)).astype('float32')

    dist_main_prog, dist_start_prog, loss = get_distributed_program()
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)
    exe.run(dist_start_prog)

    for step in range(20):
        if step == 10:
            save_distributed_checkpoint(dist_main_prog, ".",
                                        dist_attr_path=".")

        res = exe.run(dist_main_prog,
                      feed={
                          "input": input[step * 4:(step + 1) * 4, :],
                          "label": label[step * 4:(step + 1) * 4, :]
                      },
                      fetch_list=[loss])
    last_res = res[0]

    set_default_distributed_context(None)
    _global_parallel_strategy = "pp"
    _global_process_mesh = auto.ProcessMesh([0, 1])
    global PP_MESH_0
    PP_MESH_0 = auto.ProcessMesh(mesh=[0])
    global PP_MESH_1
    PP_MESH_1 = auto.ProcessMesh(mesh=[1])

    dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program(
    )
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)
    exe.run(dist_start_prog_load)

    ckpt_path = [
        "./model_state_rank0.pdmodel", "./model_state_rank1.pdmodel"
    ]
    dist_attr_path = [
        "./dist_attr_rank0.pdattr", "./dist_attr_rank1.pdattr"
    ]
    load_checkpoint_into_program(ckpt_path, dist_attr_path,
                                 dist_main_prog_load)
    for step in range(10, 20):
        if paddle.distributed.get_rank() in [0]:
            res = exe.run(dist_main_prog_load,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          })
        else:
            res = exe.run(dist_main_prog_load,
                          feed={
                              "input": input[step * 4:(step + 1) * 4, :],
                              "label": label[step * 4:(step + 1) * 4, :]
                          },
                          fetch_list=[loss_load])
    if paddle.distributed.get_rank() in [1]:
        self.assertEqual(last_res, res[0])
def test_gpt_dp_mp(self):
    global _global_parallel_strategy
    _global_parallel_strategy = "dp_mp"
    global _global_process_mesh

    _global_process_mesh = auto.ProcessMesh(
        mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])

    train_program = static.Program()
    startup_program = static.Program()
    parallelizer = AutoParallelizer(FakeFleet())
    dist_context = parallelizer._dist_context

    dist_context.process_mesh = _global_process_mesh
    train_program, startup_program, loss = gpt_pretrain_forward(
        train_program, startup_program)
    complete_train_program = auto.complete_annotation(train_program,
                                                      dist_context)

    # serial backward pass
    params_grads = parallelizer._generate_backward(complete_train_program,
                                                   startup_program,
                                                   loss,
                                                   parameter_list=None,
                                                   no_grad_set=None,
                                                   callbacks=None)

    rank_id = 3
    partitioner = Partitioner(dist_context, rank_id)
    auto_parallel_main_prog, auto_parallel_startup_prog, params_grads = partitioner.partition(
        complete_train_program, startup_program, params_grads)

    with open("./test_auto_parallel_partitioner_serial_main_new.txt",
              "w") as fw:
        fw.write(str(train_program))
    with open("./test_auto_parallel_partitioner_serial_startup_new.txt",
              "w") as fw:
        fw.write(str(startup_program))

    from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context
    set_default_distributed_context(dist_context)
    with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw:
        fw.write(str(auto_parallel_main_prog))
    with open("./test_auto_parallel_partitioner_startup_new.txt1",
              "w") as fw:
        fw.write(str(auto_parallel_startup_prog))
    # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw:
    #     from paddle.distributed.auto_parallel.completion import complete_backward_annotation
    #     complete_backward_annotation(auto_parallel_main_prog)
    #     fw.write(str(auto_parallel_main_prog))

    nrank = 4
    # col parallel
    weights = [
        'linear_0.w_0',
        'linear_6.w_0',
        'linear_10.w_0',
    ]
    self.assertTrue(
        check_tensor_split(auto_parallel_main_prog, weights,
                           complete_train_program, weights, 1, nrank))

    # row parallel
    weights = ['word_embeddings', 'linear_9.w_0', 'linear_11.w_0']
    self.assertTrue(
        check_tensor_split(auto_parallel_main_prog, weights,
                           complete_train_program, weights, 0, nrank))

    weights = ['pos_embeddings', 'layer_norm_0.b_0', 'layer_norm_4.w_0']
    self.assertTrue(
        check_tensor_split(auto_parallel_main_prog, weights,
                           complete_train_program, weights, 0, 1))

    all_params = sorted(
        [param.name for param in startup_program.all_parameters()])
    allreduce_grads = [
        'layer_norm_5.tmp_2', 'layer_norm_5.tmp_2', 'layer_norm_5.tmp_2',
        'layer_norm_6.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2',
        'layer_norm_7.tmp_2', 'layer_norm_8.tmp_2'
    ]
    process_mesh = _global_process_mesh
    mp_parallel_axis = 1
    dp_parallel_axis = 0

    group_ranks = _get_comm_group(process_mesh.processes,
                                  process_mesh.topology, mp_parallel_axis,
                                  3)
    mp_ring_id = new_process_group(group_ranks).id

    group_ranks = _get_comm_group(process_mesh.processes,
                                  process_mesh.topology, dp_parallel_axis,
                                  3)
    dp_ring_id = new_process_group(group_ranks).id

    tensor_parallel_allreduce_vars = sorted([
        op.desc.output_arg_names()[0].split("@")[0]
        for op in auto_parallel_main_prog.global_block().ops
        if (op.type == "c_allreduce_sum" and op.attr('op_role') == 1
            and op.desc.attr("ring_id") == mp_ring_id)
    ])
    data_parallel_allreduce_vars = sorted([
        op.desc.output_arg_names()[0].split("@")[0]
        for op in auto_parallel_main_prog.global_block().ops
        if (op.type == "c_allreduce_sum"
            and op.desc.attr("ring_id") == dp_ring_id)
    ])

    self.assertTrue(all_params == data_parallel_allreduce_vars)
    self.assertTrue(allreduce_grads == tensor_parallel_allreduce_vars)

    self.assertTrue(
        is_valid_completed_program(dist_context, auto_parallel_main_prog))