def __init__(self, layer_idx, input_size, output_size, init_method, need_gelu=False):
    super().__init__()
    self.need_gelu = need_gelu

    args = get_args()
    self.bias_gelu_fusion = args.bias_gelu_fusion

    # col parallel linear weight sbp: [B, S(1)]
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)]),
        )
    )
    init_method(self.weight)

    # col parallel linear bias sbp: [B, S(0)]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    flow.nn.init.zeros_(self.bias)
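# --- Hedged sketch, not in the original listing: a plausible forward pass for
# this column-parallel linear, assuming the input x carries sbp [S(0), B] so
# that matmul with the [B, S(1)] weight yields an output split along the
# model-parallel dimension. flow._C.fused_bias_add_gelu with this signature is
# an assumption; the unfused branch uses only flow.matmul and flow.gelu.
def forward(self, x):
    # x sbp: [S(0), B]; weight sbp: [B, S(1)] -> output sbp: [S(0), S(1)]
    x = flow.matmul(x, self.weight)
    if self.need_gelu:
        if self.bias_gelu_fusion:
            # assumed fused kernel: bias add + gelu in one op
            x = flow._C.fused_bias_add_gelu(x, self.bias, axis=x.ndim - 1)
        else:
            x = x + self.bias
            x = flow.gelu(x)
    else:
        x = x + self.bias
    return x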
def __init__(self, seq_length, hidden_size, vocab_size):
    super().__init__()
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    args = get_args()
    self.dropout = flow.nn.Dropout(p=args.hidden_dropout)
    self.enable_amp = args.fp16

    # word token embedding shape (vocab_size, hidden_size)
    # sbp: [B, S(0)]
    self.wte = flow.nn.Parameter(
        flow.empty(
            (self.vocab_size, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )

    # word position embedding shape (seq_len, hidden_size)
    # sbp: [B, B]
    self.wpe = flow.nn.Parameter(
        flow.empty(
            (self.seq_length, self.hidden_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(0),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )

    flow.nn.init.normal_(self.wte, std=args.init_method_std)
    flow.nn.init.normal_(self.wpe, std=args.init_method_std)
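# --- Hedged sketch, not in the original listing: one way the embedding lookup
# could be written. flow._C.gather(table, ids, axis=0) is assumed for the
# vocab-split token table; the replicated position table broadcasts over the
# batch dimension, and dropout is applied last, mirroring GPT-2. The stored
# enable_amp flag is not exercised in this sketch.
def forward(self, tokens):
    # tokens shape: (batch_size, seq_length); values index rows of wte
    word_embeds = flow._C.gather(self.wte, tokens, axis=0)
    # wpe shape (seq_length, hidden_size) broadcasts against (b, s, h)
    hidden = word_embeds + self.wpe
    return self.dropout(hidden)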
def __init__(self, layer_idx, normalized_shape, eps=1e-5):
    super().__init__()
    self.normalized_shape = normalized_shape
    self.epsilon = eps

    # layer norm affine params are replicated across the 2D mesh: sbp [B, B]
    self.beta = flow.nn.Parameter(
        flow.empty(
            normalized_shape,
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.zeros_(self.beta)

    self.gamma = flow.nn.Parameter(
        flow.empty(
            normalized_shape,
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.ones_(self.gamma)
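# --- Hedged sketch, not in the original listing: an unfused reference forward
# for this LayerNorm, assuming normalized_shape covers the last dimension.
# Production code would likely call a fused layer-norm kernel; this version
# spells out the math with basic tensor ops only.
def forward(self, x):
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, unbiased=False, keepdim=True)
    x = (x - mean) / flow.sqrt(var + self.epsilon)
    return self.gamma * x + self.beta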
def __init__(self, layer_idx, input_size, output_size, init_method, dropout_rate):
    super().__init__()
    self.dropout_rate = dropout_rate

    args = get_args()
    self.bias_dropout_fusion = args.bias_dropout_fusion
    if not self.bias_dropout_fusion:
        self.dropout = flow.nn.Dropout(p=dropout_rate)

    # row parallel linear weight sbp: [B, S(0)]
    # (the weight is split along its input dimension, so this layer is row
    # parallel: each rank holds a slice of the rows)
    self.weight = flow.nn.Parameter(
        flow.empty(
            (input_size, output_size),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
        )
    )
    init_method(self.weight)

    # row parallel linear bias sbp: [B, B]
    self.bias = flow.nn.Parameter(
        flow.empty(
            (output_size,),
            dtype=flow.float32,
            placement=dist.get_layer_placement(layer_idx),
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
        )
    )
    flow.nn.init.zeros_(self.bias)
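# --- Hedged sketch, not in the original listing: a plausible forward for this
# row-parallel linear. With x split along the model dimension ([S(0), S(1)],
# as produced by the column-parallel layer above) and the weight split along
# its input dimension ([B, S(0)]), matmul yields a partial-sum result that is
# reduced back to [S(0), B] before the bias add. flow._C.fused_bias_add_dropout
# with this signature is an assumption.
def forward(self, x):
    # x sbp: [S(0), S(1)]; weight sbp: [B, S(0)] -> partial sum [S(0), P]
    x = flow.matmul(x, self.weight)
    # reduce the partial sums so the bias add sees a replicated tensor
    x = x.to_consistent(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]))
    if self.bias_dropout_fusion:
        # assumed fused kernel: bias add + dropout in one op
        x = flow._C.fused_bias_add_dropout(x, self.bias, p=self.dropout_rate, axis=x.ndim - 1)
    else:
        x = self.dropout(x + self.bias)
    return x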
def __init__(self):
    super().__init__()
    args = get_args()
    assert args.dataset is not None

    batch_size = args.global_batch_size // args.num_accumulation_steps
    self.reader = flow.nn.GPTIndexedBinDataReader(
        data_file_prefix=args.dataset,
        seq_length=args.seq_length,
        num_samples=args.train_samples,
        batch_size=batch_size,
        dtype=flow.int64,
        shuffle=True,
        random_seed=args.seed,
        split_sizes=args.split,
        split_index=0,
        placement=dist.get_layer_placement(0, "cpu"),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    self.data_decoder = DataDecoder()
    self.label_decoder = LabelDecoder()
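# --- Hedged sketch, not in the original listing: a plausible forward that
# pairs the reader with the two decoders, producing (data, labels) from one
# raw token batch; the label decoder's forward appears at the end of this
# listing.
def forward(self):
    tokens = self.reader()
    data = self.data_decoder(tokens)
    labels = self.label_decoder(tokens)
    return data, labels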
def forward(self, x):
    x = x.to_consistent(placement=dist.get_layer_placement(self.layer_idx))
    return flow._C.identity(x)
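# --- Hedged note, not in the original listing: to_consistent re-places the
# global tensor onto the devices that own layer `self.layer_idx`, so this
# module marks a pipeline-stage boundary. flow._C.identity returns its input
# unchanged; a plausible reading is that it keeps the transfer visible as an
# explicit op in the compiled graph.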
def forward(self, tokens):
    assert tokens.ndim == 2
    return tokens.to_consistent(placement=dist.get_layer_placement(-1))[:, 1:]
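# --- Hedged illustration, not in the original listing: for a batch row
# [t0, t1, t2, t3], the slice [:, 1:] yields labels [t1, t2, t3], so the logit
# at position i is scored against token i+1; placing the result on the last
# pipeline stage (-1) co-locates labels with the loss. A matching data decoder
# would typically return tokens[:, :-1] so data and labels align in length.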