def forward(self, input_ids, position_ids=None):
    if position_ids is None:
        ones = paddle.ones_like(input_ids, dtype="int64")
        seq_length = paddle.cumsum(ones, axis=-1)
        position_ids = seq_length - ones
    input_embeddings = self.word_embeddings(input_ids)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    position_embeddings = self.position_embeddings(position_ids)
    embeddings = input_embeddings + position_embeddings
    embeddings = self.dropout(embeddings)
    return embeddings
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 64
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(input,
                              dist_attr={"process_mesh": _global_process_mesh[0],
                                         "dims_mapping": [0, -1]})
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
    return loss, train_program, start_program
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input", shape=[batch_size], dtype='int32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": PP_MESH_0,
                                     "dims_mapping": [-1]})
        auto.shard_tensor(label,
                          dist_attr={"process_mesh": PP_MESH_1,
                                     "dims_mapping": [-1, -1]})
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
    return loss, train_program, start_program
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        predict = mlp(input)
        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)
    return avg_cost, train_program, start_program
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
                                                     capacity=4 * batch_size,
                                                     iterable=True)
    return loss, train_program, start_program, loader
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={"process_mesh": _global_process_mesh,
                                         "dims_mapping": [0, -1, -1]})
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={"process_mesh": _global_process_mesh,
                                         "dims_mapping": [0, -1, -1]})
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        out = mlp(input)
    return train_program, start_program
def gpt_pretrain_forward(train_program, start_program): with static.program_guard(train_program, start_program), utils.unique_name.guard(): batch_size = 16 sequence_len = 512 input_ids = static.data( name="input_ids", shape=[batch_size, sequence_len], dtype='int64') position_ids = static.data( name="position_ids", shape=[batch_size, sequence_len], dtype='int64') attention_mask = static.data( name="attention_mask", shape=[batch_size, 1, sequence_len, sequence_len], dtype='float64') labels = static.data( name="labels", shape=[batch_size, sequence_len], dtype='int64') loss_mask = static.data( name="loss_mask", shape=[batch_size, sequence_len], dtype='float64') if _global_parallel_strategy == "dp": auto.shard_tensor( input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor( input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) gpt = GPTModel( vocab_size=32768, hidden_size=1024, num_hidden_layers=2, num_attention_heads=16, intermediate_size=4096, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=1024, type_vocab_size=16, initializer_range=0.02, pad_token_id=0, topo=None) model = GPTForPretraining(gpt) preds = model(input_ids, position_ids, attention_mask) criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) return train_program, start_program
def make_program_serial():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={"process_mesh": auto.ProcessMesh([0]),
                                     "dims_mapping": [-1, -1, -1]})
        tmp_0 = paddle.norm(x, p=2)
    return main_program, start_program, tmp_0
def test_allgather(self):
    train_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    process_mesh = auto.ProcessMesh(mesh=[0, 3])
    with static.program_guard(train_program, startup_program):
        x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
        x = auto.shard_tensor(x,
                              dist_attr={"process_mesh": process_mesh,
                                         "dims_mapping": [0, -1]})
        w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
        w = auto.shard_tensor(w,
                              dist_attr={"process_mesh": process_mesh,
                                         "dims_mapping": [-1, -1]})
        # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
        #     x.name: [-1, -1],
        #     w.name: [-1, -1]
        # }, **{"x": x,
        #       "y": w})[0]
        y = paddle.distributed.shard_op(paddle.matmul,
                                        dist_attr={
                                            "process_mesh": process_mesh,
                                            x: {"dims_mapping": [-1, -1]},
                                            w: {"dims_mapping": [-1, -1]}
                                        })(x, w)[0]
    rank_id = 0
    dist_context = DistributedContext()
    dist_strategy = fleet.DistributedStrategy()
    partitioner = Partitioner(dist_context, rank_id)
    completer = Completer(dist_context)
    complete_train_program = completer.complete_forward_annotation(
        train_program)
    dist_context.block_state.parse_forward_blocks(complete_train_program)
    partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
        complete_train_program, startup_program, [])
    resharder = Resharder(partitioned_main_prog, partitioned_startup_prog,
                          rank_id, dist_context, partitioned_params_grads)
    resharder.reshard()
    # x should not be sliced
    self.assertTrue(check_allgather(partitioned_main_prog))
def forward(self, input): if _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh[0], "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh[0], "dims_mapping": [1, -1] }) auto.shard_tensor(self.linear2.weight, dist_attr={ "process_mesh": _global_process_mesh[1], "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear3.weight, dist_attr={ "process_mesh": _global_process_mesh[1], "dims_mapping": [1, -1] }) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) out = self.linear2(out) out = F.gelu(out, approximate=True) out = self.linear3(out) return out
def forward(self, input): if _global_parallel_strategy == "pp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": PP_MESH_0, "dims_mapping": [-1, -1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": PP_MESH_1, "dims_mapping": [-1, -1] }) else: auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, -1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, -1] }) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) return out
def forward(self, input):
    auto.shard_tensor(self.word_embeddings.weight,
                      dist_attr={"process_mesh": PP_MESH_0,
                                 "dims_mapping": [0, -1]})
    auto.shard_tensor(self.linear0.weight,
                      dist_attr={"process_mesh": PP_MESH_0,
                                 "dims_mapping": [-1, 0]})
    auto.shard_tensor(self.linear1.weight,
                      dist_attr={"process_mesh": PP_MESH_1,
                                 "dims_mapping": [0, -1]})
    auto.shard_tensor(self.linear2.weight,
                      dist_attr={"process_mesh": PP_MESH_1,
                                 "dims_mapping": [0, -1]})
    w_out = self.word_embeddings(input)
    out = self.linear0(w_out)
    gelu_out = F.gelu(out, approximate=True)
    out = self.linear1(gelu_out)
    out1 = self.linear2(gelu_out)
    out = out + out1
    return out
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        auto.shard_tensor(x,
                          dist_attr={"process_mesh": auto.ProcessMesh([0, 1]),
                                     "dims_mapping": [0, -1, -1]})
        tmp_0 = x[0]
        tmp_1 = x[:, 0, :]
        tmp_2 = x[:, :, 1]
        tmp_3 = x[:2, :2, :2]
    return main_program, start_program
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={"process_mesh": auto.ProcessMesh([0, 1]),
                                     "dims_mapping": [0, -1, -1]})
        tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2])
        tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8])
        tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1))
    return main_program, start_program
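
# Illustrative only (not part of the original tests): a minimal sketch of how an
# annotated program such as the one above could be completed, partitioned and
# resharded for a single rank, reusing the DistributedContext / Completer /
# Partitioner / Resharder pipeline shown in the allgather test earlier in this
# section. The helper name `partition_for_rank` is hypothetical.
def partition_for_rank(main_program, start_program, rank_id=0):
    dist_context = DistributedContext()
    completer = Completer(dist_context)
    completed_main = completer.complete_forward_annotation(main_program)
    dist_context.block_state.parse_forward_blocks(completed_main)
    partitioner = Partitioner(dist_context, rank_id)
    dist_main, dist_startup, dist_params_grads = partitioner.partition(
        completed_main, start_program, [])
    resharder = Resharder(dist_main, dist_startup, rank_id, dist_context,
                          dist_params_grads)
    resharder.reshard()
    return dist_main, dist_startup

# Hypothetical usage:
# dist_main, dist_startup = partition_for_rank(*make_program_dp2(), rank_id=0)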
def mlp_forward(input, label, hidden_size):
    if _global_parallel_strategy == "dp":
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    mlp = MLPLayer(hidden_size=hidden_size,
                   intermediate_size=4 * hidden_size,
                   initializer_range=0.02)
    predict = mlp(input)
    error_cost = paddle.nn.functional.square_error_cost(predict, label)
    loss = paddle.mean(error_cost)
    return loss
def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) if _global_parallel_strategy == "mp": auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) if _global_parallel_strategy == "mp": auto.shard_tensor(self.linear2.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.linear2.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) # tgt = self.dropout2( # self.linear2(F.gelu( # self.linear1(tgt), approximate=True))) tgt = self.linear1(tgt) tgt = F.gelu(tgt, approximate=True) tgt = self.dropout2(self.linear2(tgt)) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache)
def forward(self, input):
    out = self.norm(input)
    auto.shard_tensor(self.linear0.weight,
                      dist_attr={"process_mesh": _g_process_mesh[0],
                                 "dims_mapping": [-1, 0]})
    out = self.linear0(out)
    out = F.gelu(out, approximate=True)
    auto.shard_tensor(self.linear1.weight,
                      dist_attr={"process_mesh": _g_process_mesh[1],
                                 "dims_mapping": [0, -1]})
    out = self.linear1(out)
    return out
def forward(self, input):
    auto.shard_tensor(self.linear0.weight,
                      dist_attr={"process_mesh": PP_MESH_0,
                                 "dims_mapping": [-1, 1]})
    auto.shard_tensor(self.linear1.weight,
                      dist_attr={"process_mesh": PP_MESH_1,
                                 "dims_mapping": [1, -1]})
    out = self.norm(input)
    out = self.linear0(out)
    out = F.gelu(out, approximate=True)
    out = self.linear1(out)
    return out
def _set_data_parallel(self, var):
    if self._nranks == 1:
        self._default_strategy = 'serial'
        auto.shard_tensor(var,
                          dist_attr={
                              "process_mesh": [0],
                              "dims_mapping":
                              [-1 for _ in range(len(var.shape))]
                          })
    else:
        self._default_strategy = 'dp'
        auto.shard_tensor(var,
                          dist_attr={
                              "process_mesh": list(range(self._nranks)),
                              "dims_mapping":
                              [0] + [-1 for _ in range(len(var.shape) - 1)]
                          })
    return var
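
# Illustrative only (not in the original source): the dims_mapping patterns the
# helper above assigns, assuming a rank-3 data variable such as
# [batch, seq_len, hidden]; the concrete shape below is hypothetical.
var_shape = [8, 512, 1024]
serial_mapping = [-1 for _ in range(len(var_shape))]        # nranks == 1 -> [-1, -1, -1]
dp_mapping = [0] + [-1 for _ in range(len(var_shape) - 1)]  # nranks > 1  -> [0, -1, -1]
assert serial_mapping == [-1, -1, -1] and dp_mapping == [0, -1, -1]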
def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
    r"""
    Prepares linear projected queries, keys and values for use in subsequent
    multiple parallel attention. If `cache` is not None, cached results are
    used to reduce redundant calculations.
    """
    q = self.q_proj(query)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    if isinstance(cache, self.StaticCache):
        # for encoder-decoder attention in inference with cached k/v
        k, v = cache.k, cache.v
    else:
        k, v = self.compute_kv(key, value)
    if isinstance(cache, self.Cache):
        # for decoder self-attention in inference
        k = tensor.concat([cache.k, k], axis=2)
        v = tensor.concat([cache.v, v], axis=2)
    if use_cache is True:
        cache = self.Cache(k, v)
    return (q, k, v) if use_cache is False else (q, k, v, cache)
def compute_kv(self, key, value):
    r"""
    Applies linear projection on input keys and values, then splits heads
    (reshape and transpose) to get keys and values from different
    representation subspaces. The results are used as key-value pairs for
    subsequent multiple parallel attention.

    It is part of the calculations in multi-head attention, and is provided
    as a method to pre-compute and prefetch these results, so that they can
    be used to construct a cache for inference.
    """
    k = self.k_proj(key)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    v = self.v_proj(value)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
    return k, v
def create_model(train_program, start_program):
    with paddle.static.program_guard(train_program, start_program):
        MESH_0 = auto.ProcessMesh([0, 1])
        input = paddle.static.data(name='input', shape=[8, 8])
        label = paddle.static.data(name='label', shape=[8, 8])
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(mean=0.0, std=0.02))
        linear0 = nn.Linear(8, 8, weight_attr)
        linear1 = nn.Linear(8, 8, weight_attr)
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, -1]})
        auto.shard_tensor(label,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, -1]})
        auto.shard_tensor(linear0.weight,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(linear1.weight,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [0, -1]})
        linear0_out = linear0(input)
        gelu_out = F.gelu(linear0_out)
        linear1_out = linear1(gelu_out)
        error_cost = paddle.nn.functional.square_error_cost(linear1_out, label)
        loss = paddle.mean(error_cost)
    return train_program, start_program, loss, input, label
def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): self.checkpoints = [] if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(cache[0].k)[-2] position_ids = paddle.arange(past_length, paddle.shape(input_ids)[-1] + past_length, dtype='int64') position_ids = position_ids.unsqueeze(0) position_ids = paddle.fluid.layers.expand_as( position_ids, input_ids) embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) if _global_parallel_strategy == "pp": auto.shard_tensor(input_ids, dist_attr={ "process_mesh": PP_MESH_LIST[0], "dims_mapping": [-1 for i in range(len(input_ids.shape))] }) if _global_parallel_strategy == "dp_pp": auto.shard_tensor( input_ids, dist_attr={ "process_mesh": DPPP_MESH_LIST[0], "dims_mapping": [0] + [-1 for i in range(len(input_ids.shape) - 1)] }) if _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor( input_ids, dist_attr={ "process_mesh": DPMPPP_MESH_LIST[0], "dims_mapping": [0] + [-1 for i in range(len(input_ids.shape) - 1)] }) encoder_outputs = self.decoder(embedding_output, memory=None, tgt_mask=attention_mask, use_cache=use_cache, cache=cache) self.checkpoints.extend(self.decoder.checkpoints) return encoder_outputs
def get_program(): dist_strategy = fleet.DistributedStrategy() dist_strategy.semi_auto = True # fleet.init(is_collective=True, strategy=dist_strategy) train_program = static.Program() start_program = static.Program() with static.program_guard(train_program, start_program): # input input = static.data( name="input", shape=[batch_size, sequence_len, hidden_size], dtype='float32') label = static.data( name="label", shape=[batch_size, sequence_len, 1], dtype='float32') data_holder = [input, label] # dataloader dataloader = paddle.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False) dataloader.set_batch_generator( batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr auto.shard_tensor( input, dist_attr={ "process_mesh": _g_process_mesh[0], "dims_mapping": [0, -1, -1] }) auto.shard_tensor( label, dist_attr={ "process_mesh": _g_process_mesh[0], "dims_mapping": [0, -1, -1] }) mlp_start = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, initializer_range=0.02) pred = mlp_start(input) mlp_mid = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, initializer_range=0.02) pred = mlp_mid(pred) mlp_end = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, initializer_range=0.02) pred = mlp_end(pred) error_cost = paddle.nn.functional.square_error_cost(pred, label) loss = paddle.mean(error_cost) optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, epsilon=1e-08, grad_clip=None) feed_vars = {"inputs": [input], "labels": [label]} fetch_vars = {"loss": [loss]} return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars
def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): """ Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if use_cache is False: if self.fuse: q, k, v = self._fuse_prepare_qkv(query) else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) if self.dropout: weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": MPPP_MESH_LIST[self.mesh_idx], "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx], "dims_mapping": [1, -1] }) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs)
def get_gpt_model(self, strategy, place, batch_size, sequence_len, vocab_size):
    modeling.init_global()
    if strategy == "dp":
        modeling._global_parallel_strategy = "dp"
        modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    elif strategy == "mp":
        modeling._global_parallel_strategy = "mp"
        modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
    else:
        raise ValueError("'get_gpt_model' only supports dp and mp.")
    tokens = paddle.static.data(name="tokens",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
    position_ids = paddle.static.data(name="position_ids",
                                      shape=[batch_size, sequence_len],
                                      dtype='int64')
    attention_mask = paddle.static.data(
        name="attention_mask",
        shape=[batch_size, 1, sequence_len, sequence_len],
        dtype='float32')
    labels = paddle.static.data(name="labels",
                                shape=[batch_size, sequence_len],
                                dtype='int64')
    loss_mask = paddle.static.data(name="loss_mask",
                                   shape=[batch_size, sequence_len],
                                   dtype='float32')
    data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
    if modeling._global_parallel_strategy == "dp":
        auto.shard_tensor(tokens,
                          dist_attr={
                              "process_mesh": modeling._global_process_mesh,
                              "dims_mapping": [0, -1]
                          })
    elif modeling._global_parallel_strategy == "pp":
        auto.shard_tensor(tokens,
                          dist_attr={
                              "process_mesh": modeling.PP_MESH_LIST[0],
                              "dims_mapping": [-1, -1]
                          })
        auto.shard_tensor(attention_mask,
                          dist_attr={
                              "process_mesh": modeling.PP_MESH_LIST[0],
                              "dims_mapping": [-1, -1, -1, -1]
                          })
    gpt = GPTModel(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
                   num_attention_heads=8, intermediate_size=256,
                   hidden_act="gelu", hidden_dropout_prob=0.0,
                   attention_probs_dropout_prob=0.0,
                   max_position_embeddings=1024, type_vocab_size=1,
                   initializer_range=0.02, pad_token_id=0, eos_token_id=7,
                   bos_token_id=0, eol_token_id=3)
    model = GPTForPretraining(gpt, vocab_size=1000, hidden_size=64,
                              initializer_range=0.02)
    preds = model(tokens, position_ids, attention_mask)
    criterion = GPTPretrainingCriterion()
    loss = criterion(preds, labels, loss_mask)
    clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                     beta1=0.9,
                                                     beta2=0.999,
                                                     epsilon=1e-08,
                                                     grad_clip=clip)
    optimizer = fleet.distributed_optimizer(optimizer)
    startup_program = paddle.static.default_startup_program()
    _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
        loss, startup_program)

    def gen_data():
        np.random.seed(2021)
        for _ in range(10):
            tokens = []
            position_ids = []
            attention_mask = []
            labels = []
            loss_mask = []
            for _ in range(batch_size):
                tokens.append(np.random.randint(vocab_size, size=sequence_len))
                position_ids.append(np.arange(sequence_len))
                attention_mask.append([np.tril(np.ones(sequence_len))])
                labels.append(np.random.randint(vocab_size, size=sequence_len))
                loss_mask.append(np.ones(sequence_len))
            yield tokens, position_ids, attention_mask, labels, loss_mask

    return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data
def forward(self, input_ids, position_ids):
    if _global_parallel_strategy == "dp":
        auto.shard_tensor(input_ids,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(input_ids,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    input_embeddings = self.word_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    embeddings = input_embeddings + position_embeddings
    embeddings = self.dropout1(embeddings)
    # Pre-norm
    target = self.norm1(embeddings)
    # The following is the attention part
    q = self.q_proj(target)
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    k = self.k_proj(target)
    v = self.v_proj(target)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
    # scale dot product attention
    product = layers.matmul(x=q, y=k, transpose_y=True,
                            alpha=self.head_dim**-0.5)
    if self.attn_mask is not None:
        product = product + self.attn_mask
    weights = F.softmax(product)
    if self.dropout_ratio:
        weights = F.dropout(weights, self.dropout_ratio,
                            training=self.training, mode="upscale_in_train")
    out = tensor.matmul(weights, v)
    # combine heads
    out = tensor.transpose(out, perm=[0, 2, 1, 3])
    out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
    # project to output
    out = self.out_proj(out)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.out_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.out_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    # Add residual
    residual = embeddings + self.dropout2(out)
    # Pre-norm
    out0 = self.norm2(residual)
    # The following is the MLP part
    out1 = self.linear0(out0)
    out2 = F.gelu(out1, approximate=True)
    out3 = self.linear1(out2)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    # Add residual
    final = residual + self.dropout3(out3)
    return final
def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
            use_cache=False, cache=None):
    """
    Applies a stack of N Transformer decoder layers on inputs. If `norm` is
    provided, also applies layer normalization on the output of the last
    decoder layer.
    """
    output = tgt
    new_caches = []
    self.checkpoints = []
    if _global_parallel_strategy == "pp":
        auto.shard_tensor(output, dist_attr={
            "process_mesh": PP_MESH_LIST[0],
            "dims_mapping": [-1 for i in range(len(output.shape))]})
    if _global_parallel_strategy == "dp_pp":
        auto.shard_tensor(output, dist_attr={
            "process_mesh": DPPP_MESH_LIST[0],
            "dims_mapping": [0] + [-1 for i in range(len(output.shape) - 1)]})
    if _global_parallel_strategy == "mp_pp":
        auto.shard_tensor(output, dist_attr={
            "process_mesh": MPPP_MESH_LIST[0],
            "dims_mapping": [-1] + [-1 for i in range(len(output.shape) - 1)]})
    if _global_parallel_strategy == "dp_mp_pp":
        auto.shard_tensor(output, dist_attr={
            "process_mesh": DPMPPP_MESH_LIST[0],
            "dims_mapping": [0] + [-1 for i in range(len(output.shape) - 1)]})
    for i, mod in enumerate(self.layers):
        if cache is None:
            if use_cache:
                if _global_parallel_strategy == "pp":
                    output, new_cache = auto.shard_op(
                        mod, dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": PP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [-1 for i in range(len(output.shape))]})
                elif _global_parallel_strategy == "dp_pp":
                    output, new_cache = auto.shard_op(
                        mod, dist_attr={"process_mesh": DPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [0] +
                        [-1 for i in range(len(output.shape) - 1)]})
                elif _global_parallel_strategy == "mp_pp":
                    output, new_cache = auto.shard_op(
                        mod, dist_attr={"process_mesh": MPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [-1] +
                        [-1 for i in range(len(output.shape) - 1)]})
                elif _global_parallel_strategy == "dp_mp_pp":
                    output, new_cache = auto.shard_op(
                        mod, dist_attr={"process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [0] +
                        [-1 for i in range(len(output.shape) - 1)]})
                else:
                    output, new_cache = mod(output, memory, tgt_mask=tgt_mask,
                                            use_cache=use_cache, cache=cache)
                new_caches.append(new_cache)
            else:
                if _global_parallel_strategy == "pp":
                    output = auto.shard_op(
                        mod, dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": PP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [-1 for i in range(len(output.shape))]})
                elif _global_parallel_strategy == "dp_pp":
                    output = auto.shard_op(
                        mod, dist_attr={"process_mesh": DPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [0] +
                        [-1 for i in range(len(output.shape) - 1)]})
                elif _global_parallel_strategy == "mp_pp":
                    output = auto.shard_op(
                        mod, dist_attr={"process_mesh": MPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [-1] +
                        [-1 for i in range(len(output.shape) - 1)]})
                elif _global_parallel_strategy == "dp_mp_pp":
                    output = auto.shard_op(
                        mod, dist_attr={"process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]}
                    )(output, memory, tgt_mask, use_cache, cache)[0]
                    auto.shard_tensor(output, dist_attr={
                        "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
                        "dims_mapping": [0] +
                        [-1 for i in range(len(output.shape) - 1)]})
                else:
                    output = mod(output, memory, tgt_mask=tgt_mask,
                                 use_cache=use_cache, cache=cache)
        else:
            if _global_parallel_strategy == "pp":
                output, new_cache = auto.shard_op(
                    mod, dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]}
                )(output, memory, tgt_mask, use_cache, cache)
                auto.shard_tensor(output, dist_attr={
                    "process_mesh": PP_MESH_LIST[mod.mesh_idx],
                    "dims_mapping": [-1 for i in range(len(output.shape))]})
            elif _global_parallel_strategy == "dp_pp":
                output, new_cache = auto.shard_op(
                    mod, dist_attr={"process_mesh": DPPP_MESH_LIST[mod.mesh_idx]}
                )(output, memory, tgt_mask, use_cache, cache)
                auto.shard_tensor(output, dist_attr={
                    "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
                    "dims_mapping": [0] +
                    [-1 for i in range(len(output.shape) - 1)]})
            elif _global_parallel_strategy == "mp_pp":
                output, new_cache = auto.shard_op(
                    mod, dist_attr={"process_mesh": MPPP_MESH_LIST[mod.mesh_idx]}
                )(output, memory, tgt_mask, use_cache, cache)
                auto.shard_tensor(output, dist_attr={
                    "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
                    "dims_mapping": [-1] +
                    [-1 for i in range(len(output.shape) - 1)]})
            elif _global_parallel_strategy == "dp_mp_pp":
                output, new_cache = auto.shard_op(
                    mod, dist_attr={"process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]}
                )(output, memory, tgt_mask, use_cache, cache)
                auto.shard_tensor(output, dist_attr={
                    "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
                    "dims_mapping": [0] +
                    [-1 for i in range(len(output.shape) - 1)]})
            else:
                output, new_cache = mod(output, memory, tgt_mask=tgt_mask,
                                        use_cache=use_cache, cache=cache[i])
            new_caches.append(new_cache)
        self.checkpoints.append(output.name)
    if self.norm is not None:
        output = self.norm(output)
    return output if use_cache is False else (output, new_caches)
def get_program():
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with fluid.program_guard(train_program, start_program):
        # loop counter
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        auto.shard_tensor(i,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1]})
        # number of loop iterations
        loop_len = fluid.layers.fill_constant(shape=[1],
                                              dtype='int64',
                                              value=epoch_num)
        auto.shard_tensor(loop_len,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1]})
        # input
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')
        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(
            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
        dataloader.set_batch_generator(batch_generator_creator(),
                                       places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        auto.shard_tensor(label,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        # fill constant bsz like
        tmp = paddle.fluid.layers.fill_constant_batch_size_like(
            input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
        auto.shard_tensor(tmp,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, 0, -1, -1]})
        # model
        mlp_start = MLPLayer(hidden_size=hidden_size,
                             intermediate_size=4 * hidden_size,
                             dropout_ratio=0.1,
                             initializer_range=0.02)
        pred = mlp_start(input)
        input_array = fluid.layers.array_write(pred, i)
        auto.shard_tensor(input_array,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        cond = fluid.layers.less_than(x=i, y=loop_len)
        auto.shard_tensor(cond,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1]})
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_input = fluid.layers.array_read(array=input_array, i=i)
            auto.shard_tensor(pre_input,
                              dist_attr={"process_mesh": _g_process_mesh,
                                         "dims_mapping": [-1, -1, -1]})
            mlp_while = MLPLayer(hidden_size=hidden_size,
                                 intermediate_size=4 * hidden_size,
                                 dropout_ratio=0.1,
                                 initializer_range=0.02)
            cur_pred = mlp_while(pre_input)
            # update the loop condition
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.array_write(cur_pred, array=input_array, i=i)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond)
        end_pred = fluid.layers.array_read(array=input_array, i=i)
        auto.shard_tensor(end_pred,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        mlp_end = MLPLayer(hidden_size=hidden_size,
                           intermediate_size=4 * hidden_size,
                           dropout_ratio=0.1,
                           initializer_range=0.02)
        pred = mlp_end(end_pred)
        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        auto.shard_tensor(error_cost,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1, -1, -1]})
        loss = paddle.mean(error_cost)
        auto.shard_tensor(loss,
                          dist_attr={"process_mesh": _g_process_mesh,
                                     "dims_mapping": [-1]})
    return train_program, start_program, dataloader, i, loss
def forward(self, input): if _global_parallel_strategy == "dp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) k = self.k_proj(input) v = self.v_proj(input) if _global_parallel_strategy == "mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask weights = F.softmax(product) if self.dropout_ratio: weights = F.dropout(weights, self.dropout_ratio, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) return out