def __init__(
    self,
    init_method,
    output_layer_init_method,
    hidden_size,
    ffn_hidden_size,
    use_cpu_initialization=False,
    bias_gelu_fusion=True,
    openai_gelu=False,
    onnx_safe=False,
    fused_fp16=False,
    fused_bf16=False,
):
    super(ParallelMLP, self).__init__()

    # Project to 4h.
    self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
        hidden_size,
        ffn_hidden_size,
        gather_output=False,
        init_method=init_method,
        skip_bias_add=True,
        use_cpu_initialization=use_cpu_initialization,
    )

    self.bias_gelu_fusion = bias_gelu_fusion
    self.activation_func = F.gelu
    if openai_gelu:
        # NOTE: `openai_gelu` is the boolean flag from the signature; it shadows the
        # OpenAI-GeLU activation function this assignment is presumably meant to select.
        self.activation_func = openai_gelu
    elif onnx_safe:
        self.activation_func = erf_gelu

    # Project back to h.
    self.dense_4h_to_h = tensor_parallel.RowParallelLinear(
        ffn_hidden_size,
        hidden_size,
        input_is_parallel=True,
        init_method=output_layer_init_method,
        skip_bias_add=True,
        use_cpu_initialization=use_cpu_initialization,
    )

    self.bias_gelu_impl = FusedBiasGeLU(fused_fp16, fused_bf16)
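
# -- Illustration (not part of the class above) ------------------------------------
# Hedged sketch: both projections use skip_bias_add=True, so each linear returns its
# output and bias separately and the bias-add is fused downstream, either with GeLU
# (FusedBiasGeLU) or with the dropout/residual add. The unfused reference math,
# written with plain torch so it runs standalone (all names here are local to the
# sketch, not attributes of ParallelMLP):
import torch
import torch.nn.functional as F


def mlp_reference(x, w_h_to_4h, b_h_to_4h, w_4h_to_h, b_4h_to_h):
    # h -> 4h, bias kept separate (what skip_bias_add=True returns).
    intermediate = F.linear(x, w_h_to_4h)
    # bias_gelu_fusion computes gelu(intermediate + bias) in one fused kernel;
    # this line is the unfused equivalent.
    intermediate = F.gelu(intermediate + b_h_to_4h)
    # 4h -> h; the output bias is handed back to the caller for a later fused add.
    return F.linear(intermediate, w_4h_to_h), b_4h_to_h


# Example shapes: hidden_size=8, ffn_hidden_size=32.
x = torch.randn(2, 8)
out, out_bias = mlp_reference(
    x, torch.randn(32, 8), torch.randn(32), torch.randn(8, 32), torch.randn(8)
)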
def __init__(
    self,
    init_method,
    output_layer_init_method,
    layer_number,
    num_attention_heads,
    hidden_size,
    attention_type=AttnType.self_attn,
    attn_mask_type=AttnMaskType.padding,
    precision=16,
    apply_query_key_layer_scaling=True,
    kv_channels=None,
    use_cpu_initialization=False,
    masked_softmax_fusion=True,
    attention_dropout=0.1,
):
    super(ParallelAttention, self).__init__()

    self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = False
    if self.apply_query_key_layer_scaling:
        self.attention_softmax_in_fp32 = True
    self.layer_number = max(1, layer_number)
    self.attention_type = attention_type
    self.attn_mask_type = attn_mask_type

    if kv_channels is None:
        assert (
            hidden_size % num_attention_heads == 0
        ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None'
        kv_channels = hidden_size // num_attention_heads
    projection_size = kv_channels * num_attention_heads

    # Per attention head and per partition values.
    world_size = parallel_state.get_tensor_model_parallel_world_size()
    self.hidden_size_per_partition = safe_divide(projection_size, world_size)
    self.hidden_size_per_attention_head = safe_divide(projection_size, num_attention_heads)
    self.num_attention_heads_per_partition = safe_divide(num_attention_heads, world_size)

    # Strided linear layer.
    if attention_type == AttnType.self_attn:
        self.query_key_value = tensor_parallel.ColumnParallelLinear(
            hidden_size,
            3 * projection_size,
            gather_output=False,
            init_method=init_method,
            use_cpu_initialization=use_cpu_initialization,
        )
    else:
        assert attention_type == AttnType.cross_attn
        self.query = tensor_parallel.ColumnParallelLinear(
            hidden_size, projection_size, gather_output=False, init_method=init_method
        )
        self.key_value = tensor_parallel.ColumnParallelLinear(
            hidden_size, 2 * projection_size, gather_output=False, init_method=init_method
        )

    coeff = None
    self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
    if self.apply_query_key_layer_scaling:
        coeff = self.layer_number
        self.norm_factor *= coeff

    fused_fp16 = precision == 16
    fused_bf16 = precision == 'bf16'
    self.scale_mask_softmax = FusedScaleMaskSoftmax(
        fused_fp16,
        fused_bf16,
        self.attn_mask_type,
        masked_softmax_fusion,
        attention_mask_func,
        self.attention_softmax_in_fp32,
        coeff,
    )

    # Dropout. Note that for a single iteration, this layer will generate
    # different outputs on different number of parallel partitions but
    # on average it should not be partition dependent.
    self.attention_dropout = torch.nn.Dropout(attention_dropout)

    # Output.
    self.dense = tensor_parallel.RowParallelLinear(
        projection_size,
        hidden_size,
        input_is_parallel=True,
        init_method=output_layer_init_method,
        skip_bias_add=True,
        use_cpu_initialization=use_cpu_initialization,
    )
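
# -- Illustration (not part of the class above) ------------------------------------
# Hedged sketch of the two pieces the constructor sets up. In the self-attention
# branch, query_key_value produces a 3 * projection_size output that each tensor-
# parallel rank reshapes per head and splits into q, k, v. With
# apply_query_key_layer_scaling, raw scores are divided by
# norm_factor = sqrt(d_head) * layer_number and the fused softmax re-applies
# coeff = layer_number, so the effective scale is still 1/sqrt(d_head) while the
# pre-softmax values stay smaller (helpful in fp16). All names below are local to
# the sketch.
import math

import torch

sq, b, heads, d_head, layer_number = 4, 2, 3, 8, 5
mixed = torch.randn(sq, b, heads * 3 * d_head)        # output of query_key_value
mixed = mixed.view(sq, b, heads, 3 * d_head)
q, k, v = torch.chunk(mixed, 3, dim=-1)               # each [sq, b, heads, d_head]

norm_factor = math.sqrt(d_head) * layer_number        # as in the constructor
scores = torch.einsum('sbhd,tbhd->bhst', q, k) / norm_factor
probs = torch.softmax(scores * layer_number, dim=-1)  # the softmax re-applies coeff
reference = torch.softmax(torch.einsum('sbhd,tbhd->bhst', q, k) / math.sqrt(d_head), dim=-1)
assert torch.allclose(probs, reference, atol=1e-6)
context = torch.einsum('bhst,tbhd->sbhd', probs, v)   # weighted sum of values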
def __init__(
    self,
    init_method,
    output_layer_init_method,
    hidden_size,
    ffn_hidden_size,
    use_cpu_initialization=False,
    bias_gelu_fusion=True,
    openai_gelu=False,
    onnx_safe=False,
    activation='gelu',
):
    super(ParallelMLP, self).__init__()
    self.activation = activation

    if activation not in ['gelu', 'geglu']:
        raise ValueError(
            f"Activation {activation} not supported. Only gelu and geglu are supported."
        )

    # Project to 4h.
    self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
        hidden_size,
        ffn_hidden_size,  # NOTE: When using geglu, scale ffn_hidden_size by 2/3 to keep the parameter count the same.
        gather_output=False,
        init_method=init_method,
        skip_bias_add=True,
        use_cpu_initialization=use_cpu_initialization,
    )

    if activation == 'geglu':
        # Separate linear layer for GEGLU activation.
        # Source: https://github.com/huggingface/transformers/blob/bee361c6f1f7704f8c688895f2f86f6e5ff84727/src/transformers/models/t5/modeling_t5.py#L292
        self.dense_h_to_4h_2 = tensor_parallel.ColumnParallelLinear(
            hidden_size,
            ffn_hidden_size,  # NOTE: When using geglu, scale ffn_hidden_size by 2/3 to keep the parameter count the same.
            gather_output=False,
            init_method=init_method,
            skip_bias_add=True,
            use_cpu_initialization=use_cpu_initialization,
        )

    self.bias_gelu_fusion = bias_gelu_fusion
    self.activation_func = F.gelu
    if activation == 'geglu':
        # Sentinel value; the forward pass implements GEGLU using F.gelu.
        self.activation_func = 'geglu'
        if bias_gelu_fusion:
            logging.warning(
                "Bias Gelu Fusion is not supported for GEGLU activation. Running with pytorch F.gelu"
            )
    if openai_gelu:
        # NOTE: `openai_gelu` is the boolean flag; it shadows the OpenAI-GeLU activation
        # function this assignment is presumably meant to select.
        self.activation_func = openai_gelu
    elif onnx_safe:
        self.activation_func = erf_gelu

    # Project back to h.
    self.dense_4h_to_h = tensor_parallel.RowParallelLinear(
        ffn_hidden_size,
        hidden_size,
        input_is_parallel=True,
        init_method=output_layer_init_method,
        skip_bias_add=True,
        use_cpu_initialization=use_cpu_initialization,
    )
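
# -- Illustration (not part of the class above) ------------------------------------
# Hedged sketch: with activation='geglu' the MLP gains a second h -> ffn projection
# (dense_h_to_4h_2) and, following the HF T5 code linked above, the forward pass
# computes gelu(x @ W1 + b1) * (x @ W2 + b2) before the ffn -> h projection. Plain
# torch reference; names are local to the sketch, and ffn_hidden_size is typically
# scaled to about 2/3 of the gelu-MLP width so parameter counts stay comparable.
import torch
import torch.nn.functional as F


def geglu_reference(x, w1, b1, w2, b2, w_out, b_out):
    gate = F.gelu(F.linear(x, w1, b1))   # dense_h_to_4h branch, GeLU-activated
    value = F.linear(x, w2, b2)          # dense_h_to_4h_2 branch, linear
    return F.linear(gate * value, w_out, b_out)


# Example shapes: hidden_size=8, ffn_hidden_size=16.
x = torch.randn(2, 8)
out = geglu_reference(
    x,
    torch.randn(16, 8), torch.randn(16),
    torch.randn(16, 8), torch.randn(16),
    torch.randn(8, 16), torch.randn(8),
)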