def forward(self, input):
        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [1, -1]
                              })
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[1],
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear3.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[1],
                                  "dims_mapping": [1, -1]
                              })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        out = self.linear2(out)
        out = F.gelu(out, approximate=True)
        out = self.linear3(out)
        return out
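For context on the "dp_mp_pp" branch above: _global_process_mesh is indexed as a list of two 2-D meshes, one per pipeline stage. A minimal sketch of how such a list might be constructed, assuming an eight-rank job and the paddle.distributed.auto_parallel import path (the rank layout and names are illustrative, not part of the original example):

import paddle.distributed.auto_parallel as auto

# Hypothetical 8-rank layout for "dp_mp_pp": two pipeline stages,
# each a 2 x 2 (data-parallel x model-parallel) mesh.
_global_parallel_strategy = "dp_mp_pp"
_global_process_mesh = [
    auto.ProcessMesh([[0, 1], [2, 3]]),  # stage 0: linear0 / linear1
    auto.ProcessMesh([[4, 5], [6, 7]]),  # stage 1: linear2 / linear3
]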
    def forward(self, input):
        out = self.norm0(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        out = self.norm1(out)
        out = self.linear2(out)
        out = F.gelu(out, approximate=True)
        out = self.linear3(out)

        out = self.norm2(out)
        out = self.linear4(out)
        out = F.gelu(out, approximate=True)
        out = self.linear5(out)
        return out
Example #3
    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
        residual = tgt

        if self.normalize_before:
            tgt = self.norm1(tgt)

        if use_cache is False:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    use_cache, cache)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        tgt = self.dropout2(
            self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
        tgt = residual + tgt

        if not self.normalize_before:
            tgt = self.norm2(tgt)

        return tgt if use_cache is False else (tgt, incremental_cache)
    def forward(self, input):
        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        return out
    def forward(self, input):
        if _global_parallel_strategy == "pp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": PP_MESH_0,
                                  "dims_mapping": [-1, -1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": PP_MESH_1,
                                  "dims_mapping": [-1, -1]
                              })
        else:
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, -1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, -1]
                              })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        return out
Example #6
 def run_gelu_op(approximate):
     with dg.guard():
         x = paddle.to_tensor(x_np)
         x.stop_gradient = False
         y = F.gelu(x, approximate=approximate)
         x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
         return y.numpy(), x_grad.numpy()
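run_gelu_op above reads x_np and y_g_np from the enclosing test; a minimal driver under that assumption might look like the following (the shapes, the dg alias for paddle.fluid.dygraph, and the all-ones upstream gradient are illustrative):

import numpy as np
import paddle
import paddle.fluid.dygraph as dg  # assumed source of dg.guard()
import paddle.nn.functional as F

x_np = np.random.randn(4, 16).astype("float32")  # illustrative input
y_g_np = np.ones_like(x_np)                      # illustrative upstream gradient

y_exact, dx_exact = run_gelu_op(approximate=False)  # erf-based GELU
y_tanh, dx_tanh = run_gelu_op(approximate=True)     # tanh approximation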
    def forward(self, input):
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [0, -1]
                          })
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1, 0]
                          })
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [0, -1]
                          })
        auto.shard_tensor(self.linear2.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [0, -1]
                          })
        w_out = self.word_embeddings(input)
        out = self.linear0(w_out)
        gelu_out = F.gelu(out, approximate=True)
        out = self.linear1(gelu_out)
        out1 = self.linear2(gelu_out)
        out = out + out1

        return out
Example #8
    def forward(self, input):

        auto.shard_tensor(self.norm.weight,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })
        auto.shard_tensor(self.norm.bias,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, 0]
                          })
        auto.shard_tensor(self.linear0.bias,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [0]
                          })
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [0, -1]
                          })
        auto.shard_tensor(self.linear1.bias,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        out = self.norm(input)
        auto.shard_tensor(out,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })
        out = self.linear0(out)
        auto.shard_tensor(out,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, 0]
                          })
        out = F.gelu(out, approximate=True)
        auto.shard_tensor(out,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, 0]
                          })
        out = self.linear1(out)
        auto.shard_tensor(out,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        return out
 def forward(self, hidden_states):
     """
     Latent block
     """
     hidden_states = self.connecter(hidden_states)
     #hidden_states = F.relu(hidden_states)
     hidden_states = F.gelu(hidden_states)
     return hidden_states
 def forward(self, input):
     out = self.norm(input)
     out = self.linear0(out)
     out = F.gelu(out, approximate=True)
     out = self.linear1(out)
     out = paddle.unsqueeze(out, axis=0)
     out = paddle.reshape(out, [4, 1024])
     return out
Example #11
    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = F.gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x
    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
        residual = tgt

        if self.normalize_before:
            tgt = self.norm1(tgt)

        if use_cache is False:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    use_cache, cache)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # tgt = self.dropout2(
        #     self.linear2(F.gelu(
        #         self.linear1(tgt), approximate=True)))
        tgt = self.linear1(tgt)
        tgt = F.gelu(tgt, approximate=True)
        tgt = self.dropout2(self.linear2(tgt))
        tgt = residual + tgt

        if not self.normalize_before:
            tgt = self.norm2(tgt)

        return tgt if use_cache is False else (tgt, incremental_cache)
Example #13
 def forward(self, input):
     out = auto.shard_op(self.norm, dist_attr={"process_mesh":
                                               PP_MESH_0})(input)[0]
     out = self.linear0(input)
     out = F.gelu(out, approximate=True)
     out = auto.shard_op(self.linear1,
                         dist_attr={"process_mesh": PP_MESH_1})(out)[0]
     out = self.dropout(out)
     out = self.linear2(out)
     return out
    def forward(self, input):
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1, 1]
                          })
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [1, -1]
                          })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        return out
    def forward(self, x):
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=d_model**-0.5)
        weights = F.softmax(product)

        weights = F.dropout(weights, 0.2)
        tgt = layers.matmul(weights, v)
        residual = tgt
        tgt = self.norm1(tgt)
        tgt = residual + tgt

        out = self.linear2(F.gelu(self.linear1(tgt), approximate=True))
        return out
Example #16
        def create_model(train_program, start_program):
            with paddle.static.program_guard(train_program, start_program):

                MESH_0 = auto.ProcessMesh([0, 1])
                input = paddle.static.data(name='input', shape=[8, 8])
                label = paddle.static.data(name='label', shape=[8, 8])

                weight_attr = paddle.ParamAttr(
                    initializer=nn.initializer.Normal(mean=0.0, std=0.02))
                linear0 = nn.Linear(8, 8, weight_attr)
                linear1 = nn.Linear(8, 8, weight_attr)

                auto.shard_tensor(input,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, -1]
                                  })
                auto.shard_tensor(label,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, -1]
                                  })

                auto.shard_tensor(linear0.weight,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, 0]
                                  })
                auto.shard_tensor(linear1.weight,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [0, -1]
                                  })

                linear0_out = linear0(input)
                gelu_out = F.gelu(linear0_out)

                linear1_out = linear1(gelu_out)

                error_cost = paddle.nn.functional.square_error_cost(
                    linear1_out, label)
                loss = paddle.mean(error_cost)
                return train_program, start_program, loss, input, label
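create_model above only builds the static graph; a minimal sketch of driving it, assuming create_model is in scope and that nn, F, and auto are imported in that module (the program setup is standard static-graph boilerplate rather than part of the original test):

import paddle

paddle.enable_static()

train_program = paddle.static.Program()
start_program = paddle.static.Program()
train_program, start_program, loss, input, label = create_model(
    train_program, start_program)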
    def forward(self, x, mask):
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=d_model**-0.5)

        weights = F.softmax(product + mask)
        # TODO(shenliang03): dropout is temporarily unsupported for save/load in PipelineParallel.
        # weights = F.dropout(weights, 0.2)
        tgt = layers.matmul(weights, v)
        residual = tgt
        tgt = self.norm1(tgt)
        tgt = residual + tgt

        out = self.linear2(F.gelu(self.linear1(tgt), approximate=True))
        return out
Example #18
    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
        if self._fuse:
            if isinstance(cache, self.Cache):
                attn_output, cache_kv_out = self.self_attn(tgt,
                                                           attn_mask=tgt_mask,
                                                           cache=cache.kv)

                # If not assigned here, the caches are updated inside the While loop.
                # layers.assign(cache_kv_out, cache.kv)
                if use_cache:
                    cache = self.Cache(cache_kv_out)
            else:
                attn_output = self.self_attn(tgt, attn_mask=tgt_mask)

            enc_out = self.ffn(attn_output)
            return (enc_out, cache) if use_cache else enc_out

        residual = tgt

        if self.normalize_before:
            tgt = self.norm1(tgt)

        if use_cache is False:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    use_cache, cache)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        tgt = self.dropout2(
            self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
        tgt = residual + tgt

        if not self.normalize_before:
            tgt = self.norm2(tgt)

        return tgt if use_cache is False else (tgt, incremental_cache)
Example #19
    def forward(self,
                tgt,
                memory=None,
                tgt_mask=None,
                use_cache=False,
                cache=None):
        residual = tgt

        if self.normalize_before:
            tgt = self.norm1(tgt)

        if use_cache is False:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    use_cache, cache)

        with get_rng_state_tracker().rng_state('global_seed'):
            tgt = residual + self.dropout1(tgt)

        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)

        if self.expert_mode:
            tgt = self.moe_mlp(tgt)
        else:
            with get_rng_state_tracker().rng_state('global_seed'):
                tgt = self.dropout2(
                    self.linear2(F.gelu(self.linear1(tgt), approximate=True)))

        tgt = residual + tgt

        if not self.normalize_before:
            tgt = self.norm2(tgt)

        return tgt if use_cache is False else (tgt, incremental_cache)
Example #20
def gelu_new(x):
    """
    Implementation of the GELU activation function currently used in the Google BERT repo (identical to OpenAI GPT).
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return F.gelu(x, approximate=True)
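As the docstring says, the approximate form is the tanh-based GELU used in the BERT/GPT reference code. A quick sketch checking that the closed-form expression matches F.gelu(x, approximate=True); the sample points and tolerances are illustrative:

import math

import numpy as np
import paddle
import paddle.nn.functional as F


def gelu_tanh_reference(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    return 0.5 * x * (1.0 + paddle.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3)))


x = paddle.to_tensor(np.linspace(-3.0, 3.0, 7, dtype="float32"))
np.testing.assert_allclose(gelu_new(x).numpy(),
                           gelu_tanh_reference(x).numpy(),
                           rtol=1e-5, atol=1e-5)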
Example #21
    def forward(self, input_ids, position_ids):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        input_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        embeddings = input_embeddings + position_embeddings
        embeddings = self.dropout1(embeddings)

        # Pre-norm
        target = self.norm1(embeddings)

        # The following is the attention part
        q = self.q_proj(target)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(target)
        v = self.v_proj(target)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scale dot product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        residual = embeddings + self.dropout2(out)

        # Pre-norm
        out0 = self.norm2(residual)

        # The following is the MLP part
        out1 = self.linear0(out0)
        out2 = F.gelu(out1, approximate=True)
        out3 = self.linear1(out2)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        final = residual + self.dropout3(out3)
        return final
Example #22
 def forward(self, x):
     x = self.htoh4(x)
     x = F.gelu(x, approximate=True)
     x = self.h4toh(x)
     return x
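The htoh4 / h4toh naming in the last example suggests the usual hidden -> 4*hidden -> hidden feed-forward expert. A hypothetical layer definition consistent with that forward (the class name, d_model, and the 4x expansion are assumptions):

import paddle.nn as nn
import paddle.nn.functional as F


class ExpertLayer(nn.Layer):
    def __init__(self, d_model=1024):
        super().__init__()
        # Expand to 4x the hidden size, apply GELU, then project back.
        self.htoh4 = nn.Linear(d_model, 4 * d_model)
        self.h4toh = nn.Linear(4 * d_model, d_model)

    def forward(self, x):
        x = self.htoh4(x)
        x = F.gelu(x, approximate=True)
        x = self.h4toh(x)
        return x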