def forward(self, input): if _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh[0], "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh[0], "dims_mapping": [1, -1] }) auto.shard_tensor(self.linear2.weight, dist_attr={ "process_mesh": _global_process_mesh[1], "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear3.weight, dist_attr={ "process_mesh": _global_process_mesh[1], "dims_mapping": [1, -1] }) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) out = self.linear2(out) out = F.gelu(out, approximate=True) out = self.linear3(out) return out
def forward(self, input):
    out = self.norm0(input)
    out = self.linear0(out)
    out = F.gelu(out, approximate=True)
    out = self.linear1(out)
    out = self.norm1(out)
    out = self.linear2(out)
    out = F.gelu(out, approximate=True)
    out = self.linear3(out)
    out = self.norm2(out)
    out = self.linear4(out)
    out = F.gelu(out, approximate=True)
    out = self.linear5(out)
    return out
def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
    residual = tgt
    if self.normalize_before:
        tgt = self.norm1(tgt)
    if use_cache is False:
        tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
    else:
        tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                use_cache, cache)
    tgt = residual + self.dropout1(tgt)
    if not self.normalize_before:
        tgt = self.norm1(tgt)
    residual = tgt
    if self.normalize_before:
        tgt = self.norm2(tgt)
    tgt = self.dropout2(
        self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
    tgt = residual + tgt
    if not self.normalize_before:
        tgt = self.norm2(tgt)
    return tgt if use_cache is False else (tgt, incremental_cache)
def forward(self, input):
    out = self.norm(input)
    out = self.linear0(out)
    out = F.gelu(out, approximate=True)
    out = self.linear1(out)
    return out
def forward(self, input): if _global_parallel_strategy == "pp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": PP_MESH_0, "dims_mapping": [-1, -1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": PP_MESH_1, "dims_mapping": [-1, -1] }) else: auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, -1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, -1] }) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) return out
def run_gelu_op(approximate):
    with dg.guard():
        x = paddle.to_tensor(x_np)
        x.stop_gradient = False
        y = F.gelu(x, approximate=approximate)
        x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
        return y.numpy(), x_grad.numpy()
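# A minimal, hypothetical driver for run_gelu_op above (a sketch, not part of
# the original test). It assumes the same imports the helper relies on
# (paddle, F = paddle.nn.functional, dg = paddle.fluid.dygraph) and supplies
# placeholder x_np / y_g_np arrays; the real test defines its own. The
# approximate=True output is checked against the standard tanh approximation
# of GELU.
import math
import numpy as np
import paddle
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F

x_np = np.random.uniform(-1.0, 1.0, size=(11, 17)).astype("float32")
y_g_np = np.random.uniform(-1.0, 1.0, size=(11, 17)).astype("float32")

def gelu_tanh_ref(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    return 0.5 * x * (1.0 + np.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * np.power(x, 3.0))))

y, x_grad = run_gelu_op(approximate=True)
np.testing.assert_allclose(y, gelu_tanh_ref(x_np), rtol=1e-5, atol=1e-5)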
def forward(self, input):
    auto.shard_tensor(self.word_embeddings.weight,
                      dist_attr={"process_mesh": PP_MESH_0,
                                 "dims_mapping": [0, -1]})
    auto.shard_tensor(self.linear0.weight,
                      dist_attr={"process_mesh": PP_MESH_0,
                                 "dims_mapping": [-1, 0]})
    auto.shard_tensor(self.linear1.weight,
                      dist_attr={"process_mesh": PP_MESH_1,
                                 "dims_mapping": [0, -1]})
    auto.shard_tensor(self.linear2.weight,
                      dist_attr={"process_mesh": PP_MESH_1,
                                 "dims_mapping": [0, -1]})
    w_out = self.word_embeddings(input)
    out = self.linear0(w_out)
    gelu_out = F.gelu(out, approximate=True)
    out = self.linear1(gelu_out)
    out1 = self.linear2(gelu_out)
    out = out + out1
    return out
def forward(self, input):
    auto.shard_tensor(self.norm.weight,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1]})
    auto.shard_tensor(self.norm.bias,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1]})
    auto.shard_tensor(self.linear0.weight,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1, 0]})
    auto.shard_tensor(self.linear0.bias,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [0]})
    auto.shard_tensor(self.linear1.weight,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [0, -1]})
    auto.shard_tensor(self.linear1.bias,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1]})
    out = self.norm(input)
    auto.shard_tensor(out,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1, -1, -1]})
    out = self.linear0(out)
    auto.shard_tensor(out,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1, -1, 0]})
    out = F.gelu(out, approximate=True)
    auto.shard_tensor(out,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1, -1, 0]})
    out = self.linear1(out)
    auto.shard_tensor(out,
                      dist_attr={"process_mesh": _g_process_mesh,
                                 "dims_mapping": [-1, -1, -1]})
    return out
def forward(self, hidden_states):
    """Latent block"""
    hidden_states = self.connecter(hidden_states)
    # hidden_states = F.relu(hidden_states)
    hidden_states = F.gelu(hidden_states)
    return hidden_states
def forward(self, input):
    out = self.norm(input)
    out = self.linear0(out)
    out = F.gelu(out, approximate=True)
    out = self.linear1(out)
    out = paddle.unsqueeze(out, axis=0)
    out = paddle.reshape(out, [4, 1024])
    return out
def forward(self, features, **kwargs):
    x = self.dense(features)
    x = F.gelu(x)
    x = self.layer_norm(x)
    # project back to size of vocabulary with bias
    x = self.decoder(x)
    return x
def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
    residual = tgt
    if self.normalize_before:
        tgt = self.norm1(tgt)
    if use_cache is False:
        tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
    else:
        tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                use_cache, cache)
    tgt = residual + self.dropout1(tgt)
    if not self.normalize_before:
        tgt = self.norm1(tgt)
    residual = tgt
    if self.normalize_before:
        tgt = self.norm2(tgt)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.linear2.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.linear2.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    # tgt = self.dropout2(
    #     self.linear2(F.gelu(
    #         self.linear1(tgt), approximate=True)))
    tgt = self.linear1(tgt)
    tgt = F.gelu(tgt, approximate=True)
    tgt = self.dropout2(self.linear2(tgt))
    tgt = residual + tgt
    if not self.normalize_before:
        tgt = self.norm2(tgt)
    return tgt if use_cache is False else (tgt, incremental_cache)
def forward(self, input):
    out = auto.shard_op(self.norm,
                        dist_attr={"process_mesh": PP_MESH_0})(input)[0]
    out = self.linear0(input)
    out = F.gelu(out, approximate=True)
    out = auto.shard_op(self.linear1,
                        dist_attr={"process_mesh": PP_MESH_1})(out)[0]
    out = self.dropout(out)
    out = self.linear2(out)
    return out
def forward(self, input): auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": PP_MESH_0, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": PP_MESH_1, "dims_mapping": [1, -1] }) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) return out
def forward(self, x):
    q = self.q_proj(x)
    k = self.k_proj(x)
    v = self.v_proj(x)
    product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_model**-0.5)
    weights = F.softmax(product)
    weights = F.dropout(weights, 0.2)
    tgt = layers.matmul(weights, v)
    residual = tgt
    tgt = self.norm1(tgt)
    tgt = residual + tgt
    out = self.linear2(F.gelu(self.linear1(tgt), approximate=True))
    return out
def create_model(train_program, start_program):
    with paddle.static.program_guard(train_program, start_program):
        MESH_0 = auto.ProcessMesh([0, 1])
        input = paddle.static.data(name='input', shape=[8, 8])
        label = paddle.static.data(name='label', shape=[8, 8])
        weight_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(mean=0.0, std=0.02))
        linear0 = nn.Linear(8, 8, weight_attr)
        linear1 = nn.Linear(8, 8, weight_attr)
        auto.shard_tensor(input,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, -1]})
        auto.shard_tensor(label,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, -1]})
        auto.shard_tensor(linear0.weight,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(linear1.weight,
                          dist_attr={"process_mesh": MESH_0,
                                     "dims_mapping": [0, -1]})
        linear0_out = linear0(input)
        gelu_out = F.gelu(linear0_out)
        linear1_out = linear1(gelu_out)
        error_cost = paddle.nn.functional.square_error_cost(linear1_out, label)
        loss = paddle.mean(error_cost)
        return train_program, start_program, loss, input, label
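# A hypothetical driver for create_model above (a sketch, not part of the
# original code): build fresh static programs, call the helper, and inspect
# the returned loss and data vars. It assumes the same imports the helper
# itself needs (paddle, paddle.nn as nn, paddle.nn.functional as F, and
# paddle.distributed.auto_parallel as auto).
import paddle

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog, startup_prog, loss, input_var, label_var = create_model(
    main_prog, startup_prog)
print(loss.name, input_var.shape, label_var.shape)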
def forward(self, x, mask):
    q = self.q_proj(x)
    k = self.k_proj(x)
    v = self.v_proj(x)
    product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_model**-0.5)
    weights = F.softmax(product + mask)
    # TODO(shenliang03) For save/load in PipeLineParallel, can't support dropout temporarily.
    # weights = F.dropout(weights, 0.2)
    tgt = layers.matmul(weights, v)
    residual = tgt
    tgt = self.norm1(tgt)
    tgt = residual + tgt
    out = self.linear2(F.gelu(self.linear1(tgt), approximate=True))
    return out
def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
    if self._fuse:
        if isinstance(cache, self.Cache):
            attn_output, cache_kv_out = self.self_attn(tgt,
                                                       attn_mask=tgt_mask,
                                                       cache=cache.kv)
            # if not assign here, update caches in While loop
            # layers.assign(cache_kv_out, cache.kv)
            if use_cache:
                cache = self.Cache(cache_kv_out)
        else:
            attn_output = self.self_attn(tgt, attn_mask=tgt_mask)
        enc_out = self.ffn(attn_output)
        return (enc_out, cache) if use_cache else enc_out
    residual = tgt
    if self.normalize_before:
        tgt = self.norm1(tgt)
    if use_cache is False:
        tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
    else:
        tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                use_cache, cache)
    tgt = residual + self.dropout1(tgt)
    if not self.normalize_before:
        tgt = self.norm1(tgt)
    residual = tgt
    if self.normalize_before:
        tgt = self.norm2(tgt)
    tgt = self.dropout2(
        self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
    tgt = residual + tgt
    if not self.normalize_before:
        tgt = self.norm2(tgt)
    return tgt if use_cache is False else (tgt, incremental_cache)
def forward(self, tgt, memory=None, tgt_mask=None, use_cache=False, cache=None):
    residual = tgt
    if self.normalize_before:
        tgt = self.norm1(tgt)
    if use_cache is False:
        tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
    else:
        tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                use_cache, cache)
    with get_rng_state_tracker().rng_state('global_seed'):
        tgt = residual + self.dropout1(tgt)
    if not self.normalize_before:
        tgt = self.norm1(tgt)
    residual = tgt
    if self.normalize_before:
        tgt = self.norm2(tgt)
    if self.expert_mode:
        tgt = self.moe_mlp(tgt)
    else:
        with get_rng_state_tracker().rng_state('global_seed'):
            tgt = self.dropout2(
                self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
    tgt = residual + tgt
    if not self.normalize_before:
        tgt = self.norm2(tgt)
    return tgt if use_cache is False else (tgt, incremental_cache)
def gelu_new(x):
    """
    Implementation of the GELU activation function currently in the Google
    BERT repo (identical to OpenAI GPT). Also see the Gaussian Error Linear
    Units paper: https://arxiv.org/abs/1606.08415
    """
    return F.gelu(x, approximate=True)
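# For reference (not part of the original snippet): with approximate=True,
# F.gelu computes the tanh approximation
#     0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
# while approximate=False evaluates the exact form 0.5 * x * (1 + erf(x / sqrt(2))).
# A minimal sanity check of gelu_new against that formula, assuming paddle and
# numpy are available:
import math
import numpy as np
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor(np.linspace(-3.0, 3.0, 7).astype("float32"))
tanh_ref = 0.5 * x * (1.0 + paddle.tanh(
    math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))
np.testing.assert_allclose(gelu_new(x).numpy(), tanh_ref.numpy(),
                           rtol=1e-5, atol=1e-5)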
def forward(self, input_ids, position_ids):
    if _global_parallel_strategy == "dp":
        auto.shard_tensor(input_ids,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(input_ids,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    input_embeddings = self.word_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    embeddings = input_embeddings + position_embeddings
    embeddings = self.dropout1(embeddings)
    # Pre-norm
    target = self.norm1(embeddings)
    # The following is the attention part
    q = self.q_proj(target)
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    k = self.k_proj(target)
    v = self.v_proj(target)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.q_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.k_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.v_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
    # scale dot product attention
    product = layers.matmul(x=q, y=k, transpose_y=True,
                            alpha=self.head_dim**-0.5)
    if self.attn_mask is not None:
        product = product + self.attn_mask
    weights = F.softmax(product)
    if self.dropout_ratio:
        weights = F.dropout(weights,
                            self.dropout_ratio,
                            training=self.training,
                            mode="upscale_in_train")
    out = tensor.matmul(weights, v)
    # combine heads
    out = tensor.transpose(out, perm=[0, 2, 1, 3])
    out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
    # project to output
    out = self.out_proj(out)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.out_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.out_proj.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    # Add residual
    residual = embeddings + self.dropout2(out)
    # Pre-norm
    out0 = self.norm2(residual)
    # The following is the MLP part
    out1 = self.linear0(out0)
    out2 = F.gelu(out1, approximate=True)
    out3 = self.linear1(out2)
    if _global_parallel_strategy == "mp":
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 0]})
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [0, -1]})
    elif _global_parallel_strategy == "dp_mp":
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [-1, 1]})
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={"process_mesh": _global_process_mesh,
                                     "dims_mapping": [1, -1]})
    # Add residual
    final = residual + self.dropout3(out3)
    return final
def forward(self, x):
    x = self.htoh4(x)
    x = F.gelu(x, approximate=True)
    x = self.h4toh(x)
    return x