# Constructor of the args-driven (Megatron-LM style) BertModel: hyperparameters
# are read from Megatron's global args.
def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True, pre_process=True, post_process=True):
    super(BertModel, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process

    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=self.add_binary_head,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process,
    )

    self.initialize_word_embeddings(init_method_normal)
    if self.post_process:
        # LM head output size is the word-embedding vocab dimension, so
        # logits project back onto the shared embedding matrix.
        self.lm_head = BertLMHead(
            self.word_embeddings_weight().size(0),
            args.hidden_size,
            init_method,
            args.layernorm_epsilon,
            parallel_output,
        )
        self._lm_head_key = 'lm_head'
        self.binary_head = None
        if self.add_binary_head:
            # Optional 2-way classifier (next-sentence prediction) on the pooled output.
            self.binary_head = get_linear_layer(args.hidden_size, 2, init_method)
        self._binary_head_key = 'binary_head'
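
# --- Illustration (not part of the model code) ---------------------------
# A minimal, self-contained sketch of the two weight initializers built at
# the top of the constructor above. The *_sketch helper names are
# hypothetical stand-ins; the sqrt(2 * num_layers) scaling follows
# Megatron-LM's convention of shrinking output-layer init so that
# residual-branch variance stays bounded as depth grows.
import math

import torch
import torch.nn as nn


def init_method_normal_sketch(std):
    """Return an initializer drawing weights from N(0, std^2)."""
    return lambda weight: nn.init.normal_(weight, mean=0.0, std=std)


def scaled_init_method_normal_sketch(std, num_layers):
    """Return an initializer with std shrunk by sqrt(2 * num_layers)."""
    scaled_std = std / math.sqrt(2.0 * num_layers)
    return lambda weight: nn.init.normal_(weight, mean=0.0, std=scaled_std)


_w = torch.empty(16, 16)
scaled_init_method_normal_sketch(0.02, 24)(_w)   # std ~= 0.02 / sqrt(48)
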
# Constructor of the args-driven (Megatron-LM style) T5Model.
def __init__(self, num_tokentypes=0, parallel_output=True):
    super(T5Model, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.parallel_output = parallel_output

    init_method = init_method_normal(args.init_method_std)
    scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                   args.num_layers)

    # T5 is encoder-decoder: no pooler, but a decoder stack is added.
    self.language_model, self._language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        add_decoder=True,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
    )

    # LM head sized to the (shared) word-embedding vocab dimension.
    self.lm_head = T5LMHead(
        self.language_model.embedding.word_embeddings.weight.size(0),
        parallel_output)
    self._lm_head_key = 'lm_head'
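
# --- Illustration (not part of the model code) ---------------------------
# The LM head above is sized from the word-embedding weight
# (word_embeddings.weight.size(0) == vocab size), so output logits project
# back through the shared embedding matrix. A plain-PyTorch sketch of that
# weight tying, with illustrative sizes only:
import torch
import torch.nn as nn

vocab_size, hidden_size = 1000, 64
embedding = nn.Embedding(vocab_size, hidden_size)
hidden_states = torch.randn(2, 8, hidden_size)          # [batch, seq, hidden]
logits = hidden_states @ embedding.weight.t()           # [batch, seq, vocab]
assert logits.shape == (2, 8, vocab_size)
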
# Constructor of the config-driven GPTModel: every hyperparameter is an
# explicit argument instead of being read from Megatron's global args.
def __init__(
    self,
    vocab_size,
    hidden_size,
    max_position_embeddings,
    num_layers,
    num_attention_heads,
    ffn_hidden_size,
    apply_query_key_layer_scaling=True,
    kv_channels=None,
    num_tokentypes=0,
    parallel_output=True,
    pre_process=True,
    post_process=True,
    init_method_std=0.02,
    fp16_lm_cross_entropy=False,
    use_cpu_initialization=False,
    hidden_dropout=0.1,
    precision=16,
    fp32_residual_connection=False,
    activations_checkpoint_method=None,
    activations_checkpoint_num_layers=1,
    layernorm_epsilon=1e-5,
    bias_gelu_fusion=True,
    persist_layer_norm=False,
    openai_gelu=False,
    onnx_safe=False,
):
    super(GPTModel, self).__init__()

    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process
    self.fp16_lm_cross_entropy = fp16_lm_cross_entropy

    if kv_channels is None:
        # Default per-head dimension: split the hidden size evenly across heads.
        assert (
            hidden_size % num_attention_heads == 0
        ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None'
        kv_channels = hidden_size // num_attention_heads

    self.language_model, self._language_model_key = get_language_model(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        hidden_dropout=hidden_dropout,
        num_tokentypes=num_tokentypes,
        max_position_embeddings=max_position_embeddings,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads,
        apply_query_key_layer_scaling=apply_query_key_layer_scaling,
        kv_channels=kv_channels,
        ffn_hidden_size=ffn_hidden_size,
        add_pooler=False,
        encoder_attn_mask_type=AttnMaskType.causal,
        init_method=init_method_normal(init_method_std),
        scaled_init_method=scaled_init_method_normal(init_method_std, num_layers),
        pre_process=self.pre_process,
        post_process=self.post_process,
        init_method_std=init_method_std,
        use_cpu_initialization=use_cpu_initialization,
        precision=precision,
        fp32_residual_connection=fp32_residual_connection,
        activations_checkpoint_method=activations_checkpoint_method,
        activations_checkpoint_num_layers=activations_checkpoint_num_layers,
        layernorm_epsilon=layernorm_epsilon,
        bias_gelu_fusion=bias_gelu_fusion,
        persist_layer_norm=persist_layer_norm,
        openai_gelu=openai_gelu,
        onnx_safe=onnx_safe,
    )

    self.initialize_word_embeddings(
        init_method=init_method_normal(init_method_std),
        vocab_size=vocab_size,
        hidden_size=hidden_size)
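
# --- Illustration (not part of the model code) ---------------------------
# Two details from the GPT constructor above, in plain PyTorch. First, the
# kv_channels default: the hidden size is split evenly across attention
# heads. Second, a sketch of the lower-triangular mask implied by
# encoder_attn_mask_type=AttnMaskType.causal (the exact tensor layout
# inside Megatron may differ; this shows the idea only).
import torch

hidden_size, num_attention_heads = 768, 12
assert hidden_size % num_attention_heads == 0
kv_channels = hidden_size // num_attention_heads        # 64 dims per head

seq_len = 5
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
# Row i may attend only to positions <= i:
assert causal_mask[2].tolist() == [True, True, True, False, False]
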
# Constructor of the config-driven BertModel (explicit arguments rather than
# Megatron's global args).
def __init__(
    self,
    vocab_size,
    hidden_size,
    max_position_embeddings,
    num_layers,
    num_attention_heads,
    ffn_hidden_size,
    apply_query_key_layer_scaling=True,
    kv_channels=None,
    num_tokentypes=0,
    parallel_output=True,
    pre_process=True,
    post_process=True,
    init_method_std=0.02,
    fp16_lm_cross_entropy=False,
    use_cpu_initialization=False,
    hidden_dropout=0.1,
    precision=16,
    fp32_residual_connection=False,
    activations_checkpoint_method=None,
    activations_checkpoint_num_layers=1,
    layernorm_epsilon=1e-5,
    bias_gelu_fusion=True,
    openai_gelu=False,
    onnx_safe=False,
    add_binary_head=True,
):
    super(BertModel, self).__init__()

    self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process

    init_method = init_method_normal(init_method_std)
    scaled_init_method = scaled_init_method_normal(init_method_std, num_layers)

    self.language_model, self._language_model_key = get_language_model(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        hidden_dropout=hidden_dropout,
        num_tokentypes=num_tokentypes,
        max_position_embeddings=max_position_embeddings,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads,
        apply_query_key_layer_scaling=apply_query_key_layer_scaling,
        kv_channels=kv_channels,
        ffn_hidden_size=ffn_hidden_size,
        add_pooler=self.add_binary_head,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=self.pre_process,
        post_process=self.post_process,
        init_method_std=init_method_std,
        use_cpu_initialization=use_cpu_initialization,
        precision=precision,
        fp32_residual_connection=fp32_residual_connection,
        activations_checkpoint_method=activations_checkpoint_method,
        activations_checkpoint_num_layers=activations_checkpoint_num_layers,
        layernorm_epsilon=layernorm_epsilon,
        bias_gelu_fusion=bias_gelu_fusion,
        openai_gelu=openai_gelu,
        onnx_safe=onnx_safe,
    )

    self.initialize_word_embeddings(
        init_method=init_method_normal(init_method_std),
        vocab_size=vocab_size,
        hidden_size=hidden_size)

    if self.post_process:
        self.lm_head = BertLMHead(
            self.word_embeddings_weight().size(0),
            hidden_size,
            init_method,
            layernorm_epsilon,
            parallel_output,
            openai_gelu,
            onnx_safe,
        )
        self._lm_head_key = 'lm_head'
        self.binary_head = None
        if self.add_binary_head:
            # Optional 2-way classifier (next-sentence prediction) on the pooled output.
            self.binary_head = get_linear_layer(hidden_size, 2, init_method)
        self._binary_head_key = 'binary_head'
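
# --- Illustration (not part of the model code) ---------------------------
# Sketch of the optional binary head wired up above: a single linear layer
# mapping the pooled hidden state to 2-way logits (e.g. next-sentence
# prediction). nn.Linear stands in for get_linear_layer(hidden_size, 2, ...);
# sizes are illustrative only.
import torch
import torch.nn as nn

hidden_size = 64
binary_head = nn.Linear(hidden_size, 2)
pooled = torch.randn(4, hidden_size)                    # [batch, hidden]
nsp_logits = binary_head(pooled)                        # [batch, 2]
assert nsp_logits.shape == (4, 2)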