Code Example #1
    def forward(self, input_ids, position_ids=None):
        if position_ids is None:
            ones = paddle.ones_like(input_ids, dtype="int64")
            seq_length = paddle.cumsum(ones, axis=-1)
            position_ids = seq_length - ones

        input_embeddings = self.word_embeddings(input_ids)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(
                self.word_embeddings.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                self.word_embeddings.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [1, -1]
                })

        position_embeddings = self.position_embeddings(position_ids)
        embeddings = input_embeddings + position_embeddings
        embeddings = self.dropout(embeddings)
        return embeddings
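
This forward pass relies on module-level globals _global_parallel_strategy and _global_process_mesh that are defined outside the snippet. Below is a minimal sketch of that setup, assuming a two-rank mesh and the paddle.distributed.auto_parallel import style of these tests; the strategy string and rank layout are illustrative, not taken from the original file.

import paddle.distributed.auto_parallel as auto

# Illustrative module-level globals; the original tests set these up elsewhere.
_global_parallel_strategy = "mp"                      # or "dp", "dp_mp", ...
_global_process_mesh = auto.ProcessMesh(mesh=[0, 1])  # two ranks in a single mesh dimension
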
Code Example #2
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
        utils.unique_name.guard():
        batch_size = 4
        hidden_size = 64
        input = static.data(name="input",
                            shape=[batch_size, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')

        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [0, -1]
                              })
        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)
        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
    return loss, train_program, start_program
Code Example #3
def mlp_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input", shape=[batch_size], dtype='int32')
        label = static.data(name="label",
                            shape=[batch_size, 1],
                            dtype='float32')

        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1]
                          })
        auto.shard_tensor(label,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [-1, -1]
                          })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       initializer_range=0.02)

        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

    return loss, train_program, start_program
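
PP_MESH_0 and PP_MESH_1 above are the meshes of the two pipeline stages. A minimal sketch of how they might be defined, assuming one rank per stage (the rank layout is an assumption, not taken from the original test):

import paddle.distributed.auto_parallel as auto

# Hypothetical two-stage pipeline meshes: stage 0 on rank 0, stage 1 on rank 1.
PP_MESH_0 = auto.ProcessMesh([0])
PP_MESH_1 = auto.ProcessMesh([1])
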
Code Example #4
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')

        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)

        predict = mlp(input)

        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)

    return avg_cost, train_program, start_program
Code Example #5
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')

        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)

        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)

        loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
                                                     capacity=4 * batch_size,
                                                     iterable=True)

    return loss, train_program, start_program, loader
Code Example #6
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)
        out = mlp(input)
    return train_program, start_program
Code Example #7
def gpt_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 16
        sequence_len = 512
        input_ids = static.data(
            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
        position_ids = static.data(
            name="position_ids",
            shape=[batch_size, sequence_len],
            dtype='int64')
        attention_mask = static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float64')
        labels = static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64')
        loss_mask = static.data(
            name="loss_mask", shape=[batch_size, sequence_len], dtype='float64')

        if _global_parallel_strategy == "dp":
            auto.shard_tensor(
                input_ids,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                input_ids,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [0, -1]
                })

        gpt = GPTModel(
            vocab_size=32768,
            hidden_size=1024,
            num_hidden_layers=2,
            num_attention_heads=16,
            intermediate_size=4096,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=1024,
            type_vocab_size=16,
            initializer_range=0.02,
            pad_token_id=0,
            topo=None)

        model = GPTForPretraining(gpt)

        preds = model(input_ids, position_ids, attention_mask)

        criterion = GPTPretrainingCriterion()

        loss = criterion(preds, labels, loss_mask)

    return train_program, start_program
Code Example #8
def make_program_serial():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0]),
                              "dims_mapping": [-1, -1, -1]
                          })
        tmp_0 = paddle.norm(x, p=2)
    return main_program, start_program, tmp_0
Code Example #9
    def test_allgather(self):
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        process_mesh = auto.ProcessMesh(mesh=[0, 3])
        with static.program_guard(train_program, startup_program):
            x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
            x = auto.shard_tensor(x,
                                  dist_attr={
                                      "process_mesh": process_mesh,
                                      "dims_mapping": [0, -1]
                                  })

            w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
            w = auto.shard_tensor(w,
                                  dist_attr={
                                      "process_mesh": process_mesh,
                                      "dims_mapping": [-1, -1]
                                  })

            # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
            #     x.name: [-1, -1],
            #     w.name: [-1, -1]
            # }, **{"x": x,
            #       "y": w})[0]

            y = paddle.distributed.shard_op(paddle.matmul,
                                            dist_attr={
                                                "process_mesh": process_mesh,
                                                x: {
                                                    "dims_mapping": [-1, -1]
                                                },
                                                w: {
                                                    "dims_mapping": [-1, -1]
                                                }
                                            })(x, w)[0]

        rank_id = 0
        dist_context = DistributedContext()
        dist_strategy = fleet.DistributedStrategy()
        partitioner = Partitioner(dist_context, rank_id)
        completer = Completer(dist_context)
        complete_train_program = completer.complete_forward_annotation(
            train_program)
        dist_context.block_state.parse_forward_blocks(complete_train_program)
        partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
            complete_train_program, startup_program, [])
        resharder = Resharder(partitioned_main_prog, partitioned_startup_prog,
                              rank_id, dist_context, partitioned_params_grads)
        resharder.reshard()
        # x should not be sliced
        self.assertTrue(check_allgather(partitioned_main_prog))
Code Example #10
    def forward(self, input):
        if _global_parallel_strategy == "dp_mp_pp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[0],
                                  "dims_mapping": [1, -1]
                              })
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[1],
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear3.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh[1],
                                  "dims_mapping": [1, -1]
                              })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        out = self.linear2(out)
        out = F.gelu(out, approximate=True)
        out = self.linear3(out)
        return out
Code Example #11
    def forward(self, input):
        if _global_parallel_strategy == "pp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": PP_MESH_0,
                                  "dims_mapping": [-1, -1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": PP_MESH_1,
                                  "dims_mapping": [-1, -1]
                              })
        else:
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, -1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, -1]
                              })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        return out
Code Example #12
    def forward(self, input):
        auto.shard_tensor(self.word_embeddings.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [0, -1]
                          })
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1, 0]
                          })
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [0, -1]
                          })
        auto.shard_tensor(self.linear2.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [0, -1]
                          })
        w_out = self.word_embeddings(input)
        out = self.linear0(w_out)
        gelu_out = F.gelu(out, approximate=True)
        out = self.linear1(gelu_out)
        out1 = self.linear2(gelu_out)
        out = out + out1

        return out
Code Example #13
File: test_dist_slice.py Project: sandyhouse/Paddle
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0, 1]),
                              "dims_mapping": [0, -1, -1]
                          })
        tmp_0 = x[0]
        tmp_1 = x[:, 0, :]
        tmp_2 = x[:, :, 1]
        tmp_3 = x[:2, :2, :2]
    return main_program, start_program
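
The program built here can be run through the same annotation and partitioning flow shown in Code Example #9. A minimal sketch, assuming DistributedContext, Completer and Partitioner are imported as in that example:

# Sketch only: complete and partition the dp2 program above for rank 0.
main_program, start_program = make_program_dp2()

dist_context = DistributedContext()
completer = Completer(dist_context)
complete_main = completer.complete_forward_annotation(main_program)
dist_context.block_state.parse_forward_blocks(complete_main)

partitioner = Partitioner(dist_context, 0)  # rank_id = 0
dist_main, dist_start, params_grads = partitioner.partition(
    complete_main, start_program, [])
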
Code Example #14
def make_program_dp2():
    main_program = paddle.fluid.Program()
    start_program = paddle.fluid.Program()
    with paddle.static.program_guard(main_program, start_program):
        x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
        x.stop_gradient = False
        auto.shard_tensor(x,
                          dist_attr={
                              "process_mesh": auto.ProcessMesh([0, 1]),
                              "dims_mapping": [0, -1, -1]
                          })
        tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2])
        tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8])
        tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1))
    return main_program, start_program
Code Example #15
def mlp_forward(input, label, hidden_size):
    if _global_parallel_strategy == "dp":
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mapping": [0, -1]
                          })

    mlp = MLPLayer(hidden_size=hidden_size,
                   intermediate_size=4 * hidden_size,
                   initializer_range=0.02)
    predict = mlp(input)
    error_cost = paddle.nn.functional.square_error_cost(predict, label)
    loss = paddle.mean(error_cost)
    return loss
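
Unlike the earlier mlp_forward variants, this one takes already-built input and label tensors. A minimal sketch of calling it inside a program guard, with sizes borrowed from Code Example #2 and assuming the same static/MLPLayer imports as the surrounding examples:

# Sketch: build the placeholders and call mlp_forward, mirroring Code Example #2.
train_program = static.Program()
start_program = static.Program()
with static.program_guard(train_program, start_program):
    batch_size, hidden_size = 4, 64
    input = static.data(name="input", shape=[batch_size, hidden_size], dtype='float32')
    label = static.data(name="label", shape=[batch_size, 1], dtype='float32')
    loss = mlp_forward(input, label, hidden_size)
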
Code Example #16
    def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
        residual = tgt

        if self.normalize_before:
            tgt = self.norm1(tgt)

        if use_cache is False:
            tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
        else:
            tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask,
                                                    use_cache, cache)
        tgt = residual + self.dropout1(tgt)
        if not self.normalize_before:
            tgt = self.norm1(tgt)

        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear2.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # tgt = self.dropout2(
        #     self.linear2(F.gelu(
        #         self.linear1(tgt), approximate=True)))
        tgt = self.linear1(tgt)
        tgt = F.gelu(tgt, approximate=True)
        tgt = self.dropout2(self.linear2(tgt))
        tgt = residual + tgt

        if not self.normalize_before:
            tgt = self.norm2(tgt)

        return tgt if use_cache is False else (tgt, incremental_cache)
Code Example #17
    def forward(self, input):
        out = self.norm(input)
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": _g_process_mesh[0],
                              "dims_mapping": [-1, 0]
                          })
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": _g_process_mesh[1],
                              "dims_mapping": [0, -1]
                          })
        out = self.linear1(out)

        return out
Code Example #18
    def forward(self, input):
        auto.shard_tensor(self.linear0.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_0,
                              "dims_mapping": [-1, 1]
                          })
        auto.shard_tensor(self.linear1.weight,
                          dist_attr={
                              "process_mesh": PP_MESH_1,
                              "dims_mapping": [1, -1]
                          })

        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)

        return out
Code Example #19
    def _set_data_parallel(self, var):
        if self._nranks == 1:
            self._default_strategy = 'serial'
            auto.shard_tensor(var,
                              dist_attr={
                                  "process_mesh": [0],
                                  "dims_mapping":
                                  [-1 for _ in range(len(var.shape))]
                              })
        else:
            self._default_strategy = 'dp'
            auto.shard_tensor(var,
                              dist_attr={
                                  "process_mesh":
                                  list(range(self._nranks)),
                                  "dims_mapping": [0] +
                                  [-1 for _ in range(len(var.shape) - 1)]
                              })

        return var
Code Example #20
    def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
        r"""
        Prepares linearly projected queries, keys and values for use in subsequent
        multiple parallel attention. If `cache` is not None, cached results are used
        to reduce redundant calculations.
        """
        q = self.q_proj(query)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(
                self.q_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 0]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                self.q_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 1]
                })

        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        if isinstance(cache, self.StaticCache):
            # for encoder-decoder attention in inference and has cached
            k, v = cache.k, cache.v
        else:
            k, v = self.compute_kv(key, value)

        if isinstance(cache, self.Cache):
            # for decoder self-attention in inference
            k = tensor.concat([cache.k, k], axis=2)
            v = tensor.concat([cache.v, v], axis=2)
        if use_cache is True:
            cache = self.Cache(k, v)

        return (q, k, v) if use_cache is False else (q, k, v, cache)
Code Example #21
    def compute_kv(self, key, value):
        r"""
        Applies linear projection on input keys and values, then splits heads
        (reshape and transpose) to get keys and values from different representation
        subspaces. The results are used as key-value pairs for subsequent multiple
        parallel attention.
        It is part of the calculation in multi-head attention, and is provided as
        a method to pre-compute and prefetch these results so that they can be used
        to construct the cache for inference.
        """
        k = self.k_proj(key)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(
                self.k_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 0]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                self.k_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 1]
                })

        v = self.v_proj(value)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(
                self.v_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 0]
                })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(
                self.v_proj.weight,
                dist_attr={
                    "process_mesh": _global_process_mesh,
                    "dims_mapping": [-1, 1]
                })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
        return k, v
Code Example #22
        def create_model(train_program, start_program):
            with paddle.static.program_guard(train_program, start_program):

                MESH_0 = auto.ProcessMesh([0, 1])
                input = paddle.static.data(name='input', shape=[8, 8])
                label = paddle.static.data(name='label', shape=[8, 8])

                weight_attr = paddle.ParamAttr(
                    initializer=nn.initializer.Normal(mean=0.0, std=0.02))
                linear0 = nn.Linear(8, 8, weight_attr)
                linear1 = nn.Linear(8, 8, weight_attr)

                auto.shard_tensor(input,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, -1]
                                  })
                auto.shard_tensor(label,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, -1]
                                  })

                auto.shard_tensor(linear0.weight,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [-1, 0]
                                  })
                auto.shard_tensor(linear1.weight,
                                  dist_attr={
                                      "process_mesh": MESH_0,
                                      "dims_mapping": [0, -1]
                                  })

                linear0_out = linear0(input)
                gelu_out = F.gelu(linear0_out)

                linear1_out = linear1(gelu_out)

                error_cost = paddle.nn.functional.square_error_cost(
                    linear1_out, label)
                loss = paddle.mean(error_cost)
                return train_program, start_program, loss, input, label
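
A minimal sketch of driving create_model, assuming fresh static programs as in the other examples:

# Sketch: build fresh programs and construct the sharded two-layer MLP on them.
train_program = paddle.static.Program()
start_program = paddle.static.Program()
train_program, start_program, loss, input, label = create_model(
    train_program, start_program)
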
Code Example #23
 def forward(self,
             input_ids,
             position_ids=None,
             attention_mask=None,
             use_cache=False,
             cache=None):
     self.checkpoints = []
     if position_ids is None:
         past_length = 0
         if cache is not None:
             past_length = paddle.shape(cache[0].k)[-2]
         position_ids = paddle.arange(past_length,
                                      paddle.shape(input_ids)[-1] +
                                      past_length,
                                      dtype='int64')
         position_ids = position_ids.unsqueeze(0)
         position_ids = paddle.fluid.layers.expand_as(
             position_ids, input_ids)
     embedding_output = self.embeddings(input_ids=input_ids,
                                        position_ids=position_ids)
     if _global_parallel_strategy == "pp":
         auto.shard_tensor(input_ids,
                           dist_attr={
                               "process_mesh":
                               PP_MESH_LIST[0],
                               "dims_mapping":
                               [-1 for i in range(len(input_ids.shape))]
                           })
     if _global_parallel_strategy == "dp_pp":
         auto.shard_tensor(
             input_ids,
             dist_attr={
                 "process_mesh":
                 DPPP_MESH_LIST[0],
                 "dims_mapping":
                 [0] + [-1 for i in range(len(input_ids.shape) - 1)]
             })
     if _global_parallel_strategy == "dp_mp_pp":
         auto.shard_tensor(
             input_ids,
             dist_attr={
                 "process_mesh":
                 DPMPPP_MESH_LIST[0],
                 "dims_mapping":
                 [0] + [-1 for i in range(len(input_ids.shape) - 1)]
             })
     encoder_outputs = self.decoder(embedding_output,
                                    memory=None,
                                    tgt_mask=attention_mask,
                                    use_cache=use_cache,
                                    cache=cache)
     self.checkpoints.extend(self.decoder.checkpoints)
     return encoder_outputs
Code Example #24
def get_program():
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with static.program_guard(train_program, start_program):
        # input
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(
            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
        dataloader.set_batch_generator(
            batch_generator_creator(), places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _g_process_mesh[0],
                "dims_mapping": [0, -1, -1]
            })
        auto.shard_tensor(
            label,
            dist_attr={
                "process_mesh": _g_process_mesh[0],
                "dims_mapping": [0, -1, -1]
            })

        mlp_start = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_start(input)

        mlp_mid = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_mid(pred)

        mlp_end = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_end(pred)

        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        loss = paddle.mean(error_cost)

        optimizer = paddle.optimizer.Adam(
            learning_rate=0.00001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            grad_clip=None)

        feed_vars = {"inputs": [input], "labels": [label]}
        fetch_vars = {"loss": [loss]}

    return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars
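
batch_generator_creator above is defined elsewhere in the original test. Below is a hypothetical sketch that matches the feed_list of [input, label], using random data shaped like the placeholder declarations and assuming the module-level batch_size, sequence_len and hidden_size used by get_program; the name and body are assumptions, not the original implementation.

import numpy as np

# Hypothetical generator factory; yields batches shaped like the placeholders above.
def batch_generator_creator():
    def __reader__():
        for _ in range(10):  # arbitrary number of batches
            batch_input = np.random.random(
                [batch_size, sequence_len, hidden_size]).astype('float32')
            batch_label = np.random.random(
                [batch_size, sequence_len, 1]).astype('float32')
            yield batch_input, batch_label

    return __reader__
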
Code Example #25
 def forward(self,
             query,
             key,
             value,
             attn_mask=None,
             use_cache=False,
             cache=None):
     """
     Applies multi-head attention to map queries and a set of key-value pairs
     to outputs.
     """
     key = query if key is None else key
     value = query if value is None else value
      # compute q, k, v
     if use_cache is False:
         if self.fuse:
             q, k, v = self._fuse_prepare_qkv(query)
         else:
             q, k, v = self._prepare_qkv(query, key, value, use_cache,
                                         cache)
     else:
         q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                            cache)
     product = layers.matmul(x=q,
                             y=k,
                             transpose_y=True,
                             alpha=self.head_dim**-0.5)
     if attn_mask is not None:
         product = product + attn_mask
     weights = F.softmax(product)
     if self.dropout:
         weights = F.dropout(weights,
                             self.dropout,
                             training=self.training,
                             mode="upscale_in_train")
     out = tensor.matmul(weights, v)
     # combine heads
     out = tensor.transpose(out, perm=[0, 2, 1, 3])
     out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
     # project to output
     out = self.out_proj(out)
     if _global_parallel_strategy == "mp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh": _global_process_mesh,
                               "dims_mapping": [0, -1]
                           })
     elif _global_parallel_strategy == "dp_mp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh": _global_process_mesh,
                               "dims_mapping": [1, -1]
                           })
     elif _global_parallel_strategy == "mp_pp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh":
                               MPPP_MESH_LIST[self.mesh_idx],
                               "dims_mapping": [0, -1]
                           })
     elif _global_parallel_strategy == "dp_mp_pp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh":
                               DPMPPP_MESH_LIST[self.mesh_idx],
                               "dims_mapping": [1, -1]
                           })
     outs = [out]
     if self.need_weights:
         outs.append(weights)
     if use_cache:
         outs.append(cache)
     return out if len(outs) == 1 else tuple(outs)
Code Example #26
    def get_gpt_model(self, strategy, place, batch_size, sequence_len,
                      vocab_size):
        modeling.init_global()
        if strategy == "dp":
            modeling._global_parallel_strategy = "dp"
            modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
        elif strategy == "mp":
            modeling._global_parallel_strategy = "mp"
            modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
        else:
            raise ValueError("'get_gpt_model' only supports dp and mp.")

        tokens = paddle.static.data(name="tokens",
                                    shape=[batch_size, sequence_len],
                                    dtype='int64')
        position_ids = paddle.static.data(name="position_ids",
                                          shape=[batch_size, sequence_len],
                                          dtype='int64')
        attention_mask = paddle.static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float32')
        labels = paddle.static.data(name="labels",
                                    shape=[batch_size, sequence_len],
                                    dtype='int64')
        loss_mask = paddle.static.data(name="loss_mask",
                                       shape=[batch_size, sequence_len],
                                       dtype='float32')
        data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]

        if modeling._global_parallel_strategy == "dp":
            auto.shard_tensor(tokens,
                              dist_attr={
                                  "process_mesh":
                                  modeling._global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif modeling._global_parallel_strategy == "pp":
            auto.shard_tensor(tokens,
                              dist_attr={
                                  "process_mesh": modeling.PP_MESH_LIST[0],
                                  "dims_mapping": [-1, -1]
                              })
            auto.shard_tensor(attention_mask,
                              dist_attr={
                                  "process_mesh": modeling.PP_MESH_LIST[0],
                                  "dims_mapping": [-1, -1, -1, -1]
                              })

        gpt = GPTModel(vocab_size=1000,
                       hidden_size=64,
                       num_hidden_layers=2,
                       num_attention_heads=8,
                       intermediate_size=256,
                       hidden_act="gelu",
                       hidden_dropout_prob=0.0,
                       attention_probs_dropout_prob=0.0,
                       max_position_embeddings=1024,
                       type_vocab_size=1,
                       initializer_range=0.02,
                       pad_token_id=0,
                       eos_token_id=7,
                       bos_token_id=0,
                       eol_token_id=3)

        model = GPTForPretraining(gpt,
                                  vocab_size=1000,
                                  hidden_size=64,
                                  initializer_range=0.02)
        preds = model(tokens, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)
        clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                         beta1=0.9,
                                                         beta2=0.999,
                                                         epsilon=1e-08,
                                                         grad_clip=clip)
        optimizer = fleet.distributed_optimizer(optimizer)
        startup_program = paddle.static.default_startup_program()
        _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
            loss, startup_program)

        def gen_data():
            np.random.seed(2021)
            for _ in range(10):
                tokens = []
                position_ids = []
                attention_mask = []
                labels = []
                loss_mask = []
                for _ in range(batch_size):
                    tokens.append(
                        np.random.randint(vocab_size, size=sequence_len))
                    position_ids.append(np.arange(sequence_len))
                    attention_mask.append([np.tril(np.ones(sequence_len))])
                    labels.append(
                        np.random.randint(vocab_size, size=sequence_len))
                    loss_mask.append(np.ones(sequence_len))

                yield tokens, position_ids, attention_mask, labels, loss_mask

        return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data
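
A minimal sketch of executing the returned distributed programs with the synthetic batches from gen_data, assuming place and the return values of get_gpt_model are in scope; the executor wiring is illustrative, not part of the original snippet.

import numpy as np

# Sketch: initialize parameters, then feed each synthetic batch and fetch the loss.
exe = paddle.static.Executor(place)
exe.run(dist_startup_prog)

dtypes = ['int64', 'int64', 'float32', 'int64', 'float32']  # dtypes of the placeholders above
for data in gen_data():
    feed = {
        var.name: np.array(value).astype(dtype)
        for var, value, dtype in zip(data_holder, data, dtypes)
    }
    loss_value = exe.run(dist_main_prog, feed=feed, fetch_list=[loss])
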
Code Example #27
    def forward(self, input_ids, position_ids):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        input_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        embeddings = input_embeddings + position_embeddings
        embeddings = self.dropout1(embeddings)

        # Pre-norm
        target = self.norm1(embeddings)

        # The following is the attention part
        q = self.q_proj(target)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(target)
        v = self.v_proj(target)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scale dot product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        residual = embeddings + self.dropout2(out)

        # Pre-norm
        out0 = self.norm2(residual)

        # The following is the MLP part
        out1 = self.linear0(out0)
        out2 = F.gelu(out1, approximate=True)
        out3 = self.linear1(out2)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        final = residual + self.dropout3(out3)
        return final
Code Example #28
 def forward(self,
             tgt,
             memory,
             tgt_mask=None,
             memory_mask=None,
             use_cache=False,
             cache=None):
     """
     Applies a stack of N Transformer decoder layers on inputs. If `norm` is
      provided, also applies layer normalization on the output of the last decoder
     layer.
     """
     output = tgt
     new_caches = []
     self.checkpoints = []
     if _global_parallel_strategy == "pp":
         auto.shard_tensor(output,
                           dist_attr={
                               "process_mesh":
                               PP_MESH_LIST[0],
                               "dims_mapping":
                               [-1 for i in range(len(output.shape))]
                           })
     if _global_parallel_strategy == "dp_pp":
         auto.shard_tensor(output,
                           dist_attr={
                               "process_mesh":
                               DPPP_MESH_LIST[0],
                               "dims_mapping": [0] +
                               [-1 for i in range(len(output.shape) - 1)]
                           })
     if _global_parallel_strategy == "mp_pp":
         auto.shard_tensor(output,
                           dist_attr={
                               "process_mesh":
                               MPPP_MESH_LIST[0],
                               "dims_mapping": [-1] +
                               [-1 for i in range(len(output.shape) - 1)]
                           })
     if _global_parallel_strategy == "dp_mp_pp":
         auto.shard_tensor(output,
                           dist_attr={
                               "process_mesh":
                               DPMPPP_MESH_LIST[0],
                               "dims_mapping": [0] +
                               [-1 for i in range(len(output.shape) - 1)]
                           })
     for i, mod in enumerate(self.layers):
         if cache is None:
             if use_cache:
                 if _global_parallel_strategy == "pp":
                     output, new_cache = auto.shard_op(
                         mod,
                         dist_attr={
                             "process_mesh": PP_MESH_LIST[mod.mesh_idx]
                         })(output, memory, tgt_mask, use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             PP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping":
                             [-1 for i in range(len(output.shape))]
                         })
                 elif _global_parallel_strategy == "dp_pp":
                     output, new_cache = auto.shard_op(
                         mod,
                         dist_attr={
                             "process_mesh": DPPP_MESH_LIST[mod.mesh_idx]
                         })(output, memory, tgt_mask, use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             DPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [0] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 elif _global_parallel_strategy == "mp_pp":
                     output, new_cache = auto.shard_op(
                         mod,
                         dist_attr={
                             "process_mesh": MPPP_MESH_LIST[mod.mesh_idx]
                         })(output, memory, tgt_mask, use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             MPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [-1] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 elif _global_parallel_strategy == "dp_mp_pp":
                     output, new_cache = auto.shard_op(
                         mod,
                         dist_attr={
                             "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]
                         })(output, memory, tgt_mask, use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             DPMPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [0] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 else:
                     output, new_cache = mod(output,
                                             memory,
                                             tgt_mask=tgt_mask,
                                             use_cache=use_cache,
                                             cache=cache)
                 new_caches.append(new_cache)
             else:
                 if _global_parallel_strategy == "pp":
                     output = auto.shard_op(mod,
                                            dist_attr={
                                                "process_mesh":
                                                PP_MESH_LIST[mod.mesh_idx]
                                            })(output, memory, tgt_mask,
                                               use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             PP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping":
                             [-1 for i in range(len(output.shape))]
                         })
                 elif _global_parallel_strategy == "dp_pp":
                     output = auto.shard_op(mod,
                                            dist_attr={
                                                "process_mesh":
                                                DPPP_MESH_LIST[mod.mesh_idx]
                                            })(output, memory, tgt_mask,
                                               use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             DPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [0] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 elif _global_parallel_strategy == "mp_pp":
                     output = auto.shard_op(mod,
                                            dist_attr={
                                                "process_mesh":
                                                MPPP_MESH_LIST[mod.mesh_idx]
                                            })(output, memory, tgt_mask,
                                               use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             MPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [-1] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 elif _global_parallel_strategy == "dp_mp_pp":
                     output = auto.shard_op(
                         mod,
                         dist_attr={
                             "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]
                         })(output, memory, tgt_mask, use_cache, cache)[0]
                     auto.shard_tensor(
                         output,
                         dist_attr={
                             "process_mesh":
                             DPMPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping": [0] +
                             [-1 for i in range(len(output.shape) - 1)]
                         })
                 else:
                     output = mod(output,
                                  memory,
                                  tgt_mask=tgt_mask,
                                  use_cache=use_cache,
                                  cache=cache)
         else:
             if _global_parallel_strategy == "pp":
                 output, new_cache = auto.shard_op(
                     mod,
                     dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]
                                })(output, memory, tgt_mask, use_cache,
                                   cache)
                 auto.shard_tensor(
                     output,
                     dist_attr={
                         "process_mesh": PP_MESH_LIST[mod.mesh_idx],
                         "dims_mapping":
                         [-1 for i in range(len(output.shape))]
                     })
             elif _global_parallel_strategy == "dp_pp":
                 output, new_cache = auto.shard_op(
                     mod,
                     dist_attr={
                         "process_mesh": DPPP_MESH_LIST[mod.mesh_idx]
                     })(output, memory, tgt_mask, use_cache, cache)
                 auto.shard_tensor(
                     output,
                     dist_attr={
                         "process_mesh":
                         DPPP_MESH_LIST[mod.mesh_idx],
                         "dims_mapping":
                         [0] + [-1 for i in range(len(output.shape) - 1)]
                     })
             elif _global_parallel_strategy == "mp_pp":
                 output, new_cache = auto.shard_op(
                     mod,
                     dist_attr={
                         "process_mesh": MPPP_MESH_LIST[mod.mesh_idx]
                     })(output, memory, tgt_mask, use_cache, cache)
                 auto.shard_tensor(
                     output,
                     dist_attr={
                         "process_mesh":
                         MPPP_MESH_LIST[mod.mesh_idx],
                         "dims_mapping":
                         [-1] + [-1 for i in range(len(output.shape) - 1)]
                     })
             elif _global_parallel_strategy == "dp_mp_pp":
                 output, new_cache = auto.shard_op(
                     mod,
                     dist_attr={
                         "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]
                     })(output, memory, tgt_mask, use_cache, cache)
                 auto.shard_tensor(
                     output,
                     dist_attr={
                         "process_mesh":
                         DPMPPP_MESH_LIST[mod.mesh_idx],
                         "dims_mapping":
                         [0] + [-1 for i in range(len(output.shape) - 1)]
                     })
             else:
                 output, new_cache = mod(output,
                                         memory,
                                         tgt_mask=tgt_mask,
                                         use_cache=use_cache,
                                         cache=cache[i])
             new_caches.append(new_cache)
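          # Record this layer's output variable name in self.checkpoints (e.g. for recompute).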
         self.checkpoints.append(output.name)
     if self.norm is not None:
         output = self.norm(output)
     return output if use_cache is False else (output, new_caches)
Code example #29
def get_program():
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with fluid.program_guard(train_program, start_program):

        # loop counter
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        auto.shard_tensor(i,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        # number of loop iterations
        loop_len = fluid.layers.fill_constant(shape=[1],
                                              dtype='int64',
                                              value=epoch_num)
        auto.shard_tensor(loop_len,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        # input
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')

        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(feed_list=data_holder,
                                                         capacity=4 *
                                                         batch_size,
                                                         iterable=False)
        dataloader.set_batch_generator(batch_generator_creator(),
                                       places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })
        auto.shard_tensor(label,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        # fill constant bsz like
        tmp = paddle.fluid.layers.fill_constant_batch_size_like(
            input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
        auto.shard_tensor(tmp,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, 0, -1, -1]
                          })

        # model
        mlp_start = MLPLayer(hidden_size=hidden_size,
                             intermediate_size=4 * hidden_size,
                             dropout_ratio=0.1,
                             initializer_range=0.02)
        pred = mlp_start(input)

        input_array = fluid.layers.array_write(pred, i)
        auto.shard_tensor(input_array,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        cond = fluid.layers.less_than(x=i, y=loop_len)
        auto.shard_tensor(cond,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        while_op = fluid.layers.While(cond=cond)
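        # The block below is captured once when the program is built and is
        # re-executed at run time until cond becomes False.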
        with while_op.block():

            pre_input = fluid.layers.array_read(array=input_array, i=i)
            auto.shard_tensor(pre_input,
                              dist_attr={
                                  "process_mesh": _g_process_mesh,
                                  "dims_mapping": [-1, -1, -1]
                              })

            mlp_while = MLPLayer(hidden_size=hidden_size,
                                 intermediate_size=4 * hidden_size,
                                 dropout_ratio=0.1,
                                 initializer_range=0.02)
            cur_pred = mlp_while(pre_input)

            # update the loop counter and recompute the exit condition
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.array_write(cur_pred, array=input_array, i=i)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond)

        end_pred = fluid.layers.array_read(array=input_array, i=i)
        auto.shard_tensor(end_pred,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        mlp_end = MLPLayer(hidden_size=hidden_size,
                           intermediate_size=4 * hidden_size,
                           dropout_ratio=0.1,
                           initializer_range=0.02)
        pred = mlp_end(end_pred)

        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        auto.shard_tensor(error_cost,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        loss = paddle.mean(error_cost)
        auto.shard_tensor(loss,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

    return train_program, start_program, dataloader, i, loss
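
The snippet above relies on module-level globals that are defined elsewhere in the same test file: _g_process_mesh, batch_size, sequence_len, hidden_size, epoch_num, batch_generator_creator and MLPLayer. Below is a minimal sketch of how the data-related globals might be set up, assuming the dist_attr-style semi-auto API used throughout these examples (Paddle 2.2/2.3 era); every name and value in the sketch is illustrative rather than taken from the original test.

import numpy as np
import paddle.distributed.auto_parallel as auto

# Illustrative values only; the original test defines its own.
batch_size = 4
sequence_len = 128
hidden_size = 64
epoch_num = 2

# A 1-D process mesh over two ranks (hypothetical layout).
_g_process_mesh = auto.ProcessMesh(mesh=[0, 1])

def batch_generator_creator():
    # Returns a generator that yields (input, label) numpy batches matching
    # the static.data shapes declared in get_program().
    def __reader__():
        for _ in range(epoch_num * 8):
            x = np.random.random(
                [batch_size, sequence_len, hidden_size]).astype("float32")
            y = np.random.random(
                [batch_size, sequence_len, 1]).astype("float32")
            yield x, y

    return __reader__
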
Code example #30
    def forward(self, input):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })

        q = self.q_proj(input)
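        # Split into heads: [batch, seq, hidden] -> [batch, num_heads, seq, head_dim].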
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(input)
        v = self.v_proj(input)

        if _global_parallel_strategy == "mp":
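            # Column parallelism for q/k/v: the second (output-feature) dimension
            # of each projection weight is split across the model-parallel mesh axis.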
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)
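        # Row parallelism for the output projection: its first (input-feature)
        # dimension is split across the model-parallel mesh axis, matching the
        # column-parallel q/k/v shards above.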
        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        return out