Esempio n. 1
0
    def __init__(self,
                 layer_idx,
                 input_size,
                 output_size,
                 init_method,
                 need_gelu=False):
        """Build the weight and bias parameters of this linear layer.

        Args:
            layer_idx: Layer index handed to ``dist.get_layer_placement`` to
                pick the placement of both parameters.
            input_size: First dimension of the weight.
            output_size: Second dimension of the weight and length of the
                bias.
            init_method: Callable applied to the weight parameter; its return
                value is ignored.
            need_gelu: Flag stored as ``self.need_gelu``.
        """
        super().__init__()
        self.need_gelu = need_gelu
        self.bias_gelu_fusion = get_args().bias_gelu_fusion

        # Weight: shape (input_size, output_size), float32,
        # sbp list [broadcast, split(1)].
        weight_placement = dist.get_layer_placement(layer_idx)
        weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
        weight_data = flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=weight_placement,
            sbp=weight_sbp,
        )
        self.weight = flow.nn.Parameter(weight_data)
        init_method(self.weight)

        # Bias: shape (output_size,), float32,
        # sbp list [broadcast, split(0)]; zero-initialized.
        bias_placement = dist.get_layer_placement(layer_idx)
        bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
        bias_data = flow.empty(
            (output_size, ),
            dtype=flow.float32,
            placement=bias_placement,
            sbp=bias_sbp,
        )
        self.bias = flow.nn.Parameter(bias_data)
        flow.nn.init.zeros_(self.bias)
Esempio n. 2
0
    def __init__(self, seq_length, hidden_size, vocab_size):
        """Create the token and position embedding tables.

        Args:
            seq_length: Number of rows of the position embedding ``wpe``.
            hidden_size: Number of columns of both embedding tables.
            vocab_size: Number of rows of the token embedding ``wte``.

        Dropout probability, the ``enable_amp`` flag and the standard
        deviation used for initialization are read from ``get_args()``.
        """
        super().__init__()
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        cfg = get_args()
        self.dropout = flow.nn.Dropout(p=cfg.hidden_dropout)
        self.enable_amp = cfg.fp16

        # Token embedding: shape (vocab_size, hidden_size), float32,
        # placed on layer 0, sbp list [broadcast, split(0)].
        wte_placement = dist.get_layer_placement(0)
        wte_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
        wte_data = flow.empty(
            (self.vocab_size, self.hidden_size),
            dtype=flow.float32,
            placement=wte_placement,
            sbp=wte_sbp,
        )
        self.wte = flow.nn.Parameter(wte_data)

        # Position embedding: shape (seq_length, hidden_size), float32,
        # placed on layer 0, sbp list [broadcast, broadcast].
        wpe_placement = dist.get_layer_placement(0)
        wpe_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
        wpe_data = flow.empty(
            (self.seq_length, self.hidden_size),
            dtype=flow.float32,
            placement=wpe_placement,
            sbp=wpe_sbp,
        )
        self.wpe = flow.nn.Parameter(wpe_data)

        # Both tables are initialized only after both exist, token table
        # first, so the order of the two random draws is fixed.
        init_std = cfg.init_method_std
        flow.nn.init.normal_(self.wte, std=init_std)
        flow.nn.init.normal_(self.wpe, std=init_std)
Esempio n. 3
0
    def __init__(
        self,
        layer_idx,
        normalized_shape,
        eps=1e-5,
    ):
        """Create the ``beta`` and ``gamma`` parameters.

        Args:
            layer_idx: Layer index handed to ``dist.get_layer_placement`` to
                pick the placement of both parameters.
            normalized_shape: Shape of ``beta`` and ``gamma``; also stored as
                ``self.normalized_shape``.
            eps: Value stored as ``self.epsilon``.
        """
        super().__init__()
        self.normalized_shape = normalized_shape
        self.epsilon = eps

        def _make_broadcast_param():
            # float32 tensor of `normalized_shape` on this layer's placement,
            # sbp list [broadcast, broadcast].
            data = flow.empty(
                normalized_shape,
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            )
            return flow.nn.Parameter(data)

        # beta starts at zero.
        self.beta = _make_broadcast_param()
        flow.nn.init.zeros_(self.beta)

        # gamma starts at one.
        self.gamma = _make_broadcast_param()
        flow.nn.init.ones_(self.gamma)
Esempio n. 4
0
    def __init__(
        self,
        layer_idx,
        input_size,
        output_size,
        init_method,
        dropout_rate,
    ):
        """Build the weight and bias parameters of this linear layer.

        Args:
            layer_idx: Layer index handed to ``dist.get_layer_placement`` to
                pick the placement of both parameters.
            input_size: First dimension of the weight.
            output_size: Second dimension of the weight and length of the
                bias.
            init_method: Callable applied to the weight parameter; its return
                value is ignored.
            dropout_rate: Stored as ``self.dropout_rate``; also used as the
                probability of ``self.dropout`` when that module is created.
        """
        super().__init__()
        self.dropout_rate = dropout_rate

        self.bias_dropout_fusion = get_args().bias_dropout_fusion
        # A standalone Dropout module exists only when bias/dropout fusion
        # is turned off.
        if not self.bias_dropout_fusion:
            self.dropout = flow.nn.Dropout(p=dropout_rate)

        # Weight: shape (input_size, output_size), float32,
        # sbp list [broadcast, split(0)].
        weight_placement = dist.get_layer_placement(layer_idx)
        weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
        weight_data = flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=weight_placement,
            sbp=weight_sbp,
        )
        self.weight = flow.nn.Parameter(weight_data)
        init_method(self.weight)

        # Bias: shape (output_size,), float32,
        # sbp list [broadcast, broadcast]; zero-initialized.
        bias_placement = dist.get_layer_placement(layer_idx)
        bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
        bias_data = flow.empty(
            (output_size, ),
            dtype=flow.float32,
            placement=bias_placement,
            sbp=bias_sbp,
        )
        self.bias = flow.nn.Parameter(bias_data)
        flow.nn.init.zeros_(self.bias)
Esempio n. 5
0
    def __init__(self):
        """Set up the dataset reader and the data/label decoders.

        All settings come from ``get_args()``; ``args.dataset`` must not be
        ``None``.
        """
        super().__init__()
        cfg = get_args()
        assert cfg.dataset is not None

        # Batch size handed to the reader: the global batch divided
        # (floor division) by the number of accumulation steps.
        reader_batch_size = cfg.global_batch_size // cfg.num_accumulation_steps

        # The reader is placed on the "cpu" placement of layer 0 with
        # sbp list [split(0), broadcast].
        reader_placement = dist.get_layer_placement(0, "cpu")
        reader_sbp = dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])

        self.reader = flow.nn.GPTIndexedBinDataReader(
            data_file_prefix=cfg.dataset,
            seq_length=cfg.seq_length,
            num_samples=cfg.train_samples,
            batch_size=reader_batch_size,
            dtype=flow.int64,
            shuffle=True,
            random_seed=cfg.seed,
            split_sizes=cfg.split,
            split_index=0,
            placement=reader_placement,
            sbp=reader_sbp,
        )
        self.data_decoder = DataDecoder()
        self.label_decoder = LabelDecoder()
Esempio n. 6
0
 def forward(self, x):
     """Move ``x`` to the placement of ``self.layer_idx``.

     The moved tensor is passed through ``flow._C.identity`` and that
     result is returned.
     """
     target_placement = dist.get_layer_placement(self.layer_idx)
     moved = x.to_consistent(placement=target_placement)
     return flow._C.identity(moved)
Esempio n. 7
0
 def forward(self, tokens):
     """Move ``tokens`` to the placement of layer ``-1`` and drop column 0.

     ``tokens`` must be 2-dimensional. The result keeps every row and all
     columns from index 1 onward.
     """
     assert tokens.ndim == 2
     target_placement = dist.get_layer_placement(-1)
     relocated = tokens.to_consistent(placement=target_placement)
     return relocated[:, 1:]