def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True):
    """Build the language model and, on the last pipeline stage, its heads.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        add_binary_head: stored on the instance, forwarded as
            ``add_pooler``, and decides whether ``binary_head`` is built.
        parallel_output: stored on the instance and handed to
            ``BertLMHead``.
    """
    super(BertModelBase, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=add_binary_head,
        init_method=init_method,
        scaled_init_method=scaled_init_method)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # The factory itself (not the initializer built from it) is passed on.
    self.initialize_word_embeddings(init_method_normal)

    # The heads are only created where mpu reports the last pipeline stage.
    if not mpu.is_pipeline_last_stage():
        return

    vocab_rows = self.word_embeddings_weight().size(0)
    self.lm_head = BertLMHead(vocab_rows, args.hidden_size, init_method,
                              args.layernorm_epsilon, parallel_output)
    self._lm_head_key = 'lm_head'

    # binary_head is always defined on this stage; it stays None unless
    # the binary head was requested.
    self.binary_head = (
        get_linear_layer(args.hidden_size, 2, init_method)
        if add_binary_head else None)
    if add_binary_head:
        self._binary_head_key = 'binary_head'
def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True):
    """Build the language model, the LM head and the optional binary head.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        add_binary_head: stored on the instance, forwarded as
            ``add_pooler``, and decides whether ``binary_head`` is built.
        parallel_output: stored on the instance and handed to
            ``BertLMHead``.
    """
    super(BertModel, self).__init__()
    args = get_args()

    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=add_binary_head,
        init_method=init_method,
        scaled_init_method=scaled_init_method)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # The LM head's first argument is dim 0 of the word-embedding weight.
    word_embeddings = self.language_model.embedding.word_embeddings
    self.lm_head = BertLMHead(word_embeddings.weight.size(0),
                              args.hidden_size, init_method,
                              args.layernorm_epsilon, parallel_output)
    self._lm_head_key = 'lm_head'

    # Without the binary head neither the layer nor its key is created.
    if not add_binary_head:
        return
    self.binary_head = get_linear_layer(args.hidden_size, 2, init_method)
    self._binary_head_key = 'binary_head'
def __init__(self, num_tokentypes=2, parallel_output=True,
             pre_process=True, post_process=True):
    """Build the language model and an optional projection layer.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        parallel_output: stored on the instance.
        pre_process: stored on the instance and forwarded to
            ``get_language_model``.
        post_process: stored on the instance and forwarded to
            ``get_language_model``.
    """
    super(PretrainedBertModel, self).__init__()
    args = get_args()

    self.pad_id = get_tokenizer().pad
    projection_dim = args.biencoder_projection_dim
    self.biencoder_projection_dim = projection_dim
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=False,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=pre_process,
        post_process=post_process)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # The projection layer is only created for a positive configured dim.
    if projection_dim > 0:
        self.projection_enc = get_linear_layer(
            args.hidden_size, projection_dim, init_method)
        self._projection_enc_key = 'projection_enc'
def __init__(self, num_classes, finetune=False):
    """Assemble the patch encoder, position embeddings, transformer and head.

    Args:
        num_classes: stored on the instance and passed to the head
            constructor.
        finetune: when false an ``VitMlpHead`` is stored as ``mlp_head``;
            otherwise a zero-initialised linear layer is stored as
            ``class_head``.
    """
    super(VitModel, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy

    # Initializer pair: normal / scaled-normal built from args, unless
    # xavier-uniform was requested, in which case it is used for both.
    if not args.init_method_xavier_uniform:
        self.init_method = init_method_normal(args.init_method_std)
        self.scaled_init_method = scaled_init_method_normal(
            args.init_method_std, args.num_layers)
    else:
        self.init_method = torch.nn.init.xavier_uniform_
        self.scaled_init_method = torch.nn.init.xavier_uniform_

    self.hidden_size = args.hidden_size
    self.num_classes = num_classes
    self.patch_dim = args.patch_dim
    self.img_dim = args.img_dim
    self.finetune = finetune

    # Patch geometry: the image side must be a whole number of patches.
    assert self.img_dim % self.patch_dim == 0
    patches_per_dim = self.img_dim // self.patch_dim
    self.num_patches_per_dim = patches_per_dim
    self.num_patches = patches_per_dim**2
    # One extra position beyond the patches (presumably for cls_token).
    self.seq_length = self.num_patches + 1
    self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels

    # cls_token: created from randn, then overwritten in place with zeros.
    cls_token = torch.nn.Parameter(torch.randn(1, 1, self.hidden_size))
    torch.nn.init.zeros_(cls_token)
    self.cls_token = cls_token

    # Linear encoder
    self.linear_encoder = torch.nn.Linear(self.flatten_dim,
                                          self.hidden_size)

    # Position embedding, re-initialised with the normal initializer.
    position_embeddings = torch.nn.Embedding(self.seq_length,
                                             self.hidden_size)
    normal_init = init_method_normal(args.init_method_std)
    normal_init(position_embeddings.weight)
    self.position_embeddings = position_embeddings

    position_ids = torch.arange(self.seq_length).expand(1, -1)
    self.position_ids = position_ids.cuda()

    # Hook invoked before this embedding loads a state dict.
    position_embeddings._register_load_state_dict_pre_hook(
        twod_interpolate_position_embeddings_hook)

    self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)

    # Transformer
    self.transformer = ParallelTransformer(self.init_method,
                                           self.scaled_init_method)

    # Head: the attribute name differs between the two modes.
    if self.finetune:
        self.class_head = get_linear_layer(self.hidden_size, num_classes,
                                           torch.nn.init.zeros_)
    else:
        self.mlp_head = VitMlpHead(self.hidden_size, self.num_classes)
def __init__(self, mpu_vocab_size, hidden_size, init_method,
             layernorm_epsilon, parallel_output):
    """Create the bias, dense layer, layer norm and activation of the head.

    Args:
        mpu_vocab_size: length of the zero-initialised bias parameter.
        hidden_size: passed twice to ``get_linear_layer`` and once to
            ``LayerNorm``.
        init_method: initializer handed to ``get_linear_layer``.
        layernorm_epsilon: ``eps`` value for ``LayerNorm``.
        parallel_output: stored on the instance.
    """
    super(BertLMHead, self).__init__()
    args = get_args()

    # Tag the bias with its model-parallel attributes before registering it.
    vocab_bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
    vocab_bias.model_parallel = True
    vocab_bias.partition_dim = 0
    vocab_bias.stride = 1
    self.bias = vocab_bias

    self.parallel_output = parallel_output
    self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
    self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # torch's gelu is the default; args can switch to openai_gelu.
    self.gelu = openai_gelu if args.openai_gelu else torch.nn.functional.gelu
def __init__(self, mpu_vocab_size, hidden_size, init_method,
             layernorm_epsilon, parallel_output):
    """Create the bias, dense layer, layer norm and activation of the head.

    Args:
        mpu_vocab_size: length of the zero-initialised bias parameter.
        hidden_size: passed twice to ``get_linear_layer`` and once to
            ``LayerNorm``.
        init_method: initializer handed to ``get_linear_layer``.
        layernorm_epsilon: ``eps`` value for ``LayerNorm``.
        parallel_output: stored on the instance.
    """
    super(BertLMHead, self).__init__()
    args = get_args()

    # The mpu helper receives the bias with the arguments (True, 0, 1).
    vocab_bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
    mpu.set_tensor_model_parallel_attributes(vocab_bias, True, 0, 1)
    self.bias = vocab_bias

    self.parallel_output = parallel_output
    self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
    self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)

    # Activation precedence: openai_gelu, then erf_gelu, then torch's gelu.
    if args.openai_gelu:
        activation = openai_gelu
    elif args.onnx_safe:
        activation = erf_gelu
    else:
        activation = torch.nn.functional.gelu
    self.gelu = activation
def __init__(self, num_tokentypes=2):
    """Build the pooled language model plus the multiple-choice head.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
    """
    super(MultipleChoice, self).__init__()
    args = get_args()

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # Multi-choice head: dropout, then a linear layer built with size 1.
    self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
    self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                             init_method)
    self._multichoice_head_key = 'multichoice_head'
def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
    """Build the pooled language model and the ICT head on top of it.

    Args:
        ict_head_size: stored on the instance and used as the second
            size argument of the ICT head's ``get_linear_layer`` call.
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        parallel_output: stored on the instance.
    """
    super(IREncoderBertModel, self).__init__()
    args = get_args()

    self.ict_head_size = ict_head_size
    self.parallel_output = parallel_output

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # ICT head and the key it is stored under.
    self.ict_head = get_linear_layer(args.hidden_size, ict_head_size,
                                     init_method)
    self._ict_head_key = 'ict_head'
def __init__(self, num_classes, num_tokentypes=2):
    """Build the pooled language model and, on the last stage, the classifier.

    Args:
        num_classes: stored on the instance and used as the second size
            argument of the classification head.
        num_tokentypes: forwarded unchanged to ``get_language_model``.
    """
    super(ClassificationBase, self).__init__(share_word_embeddings=False)
    args = get_args()

    self.num_classes = num_classes

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        attention_mask_func=bert_attention_mask_func,
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        init_method=init_method,
        scaled_init_method=scaled_init_method)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # Classification head: only created where mpu reports the last
    # pipeline stage.
    if not mpu.is_pipeline_last_stage():
        return
    self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
    self.classification_head = get_linear_layer(
        args.hidden_size, num_classes, init_method)
    self._classification_head_key = 'classification_head'
def __init__(self, num_tokentypes=2, add_binary_head=True,
             parallel_output=True, pre_process=True, post_process=True):
    """Build the language model and, when post-processing, its heads.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        add_binary_head: stored on the instance, forwarded as
            ``add_pooler``, and decides whether ``binary_head`` is built.
        parallel_output: stored on the instance and handed to
            ``BertLMHead``.
        pre_process: stored on the instance and forwarded to
            ``get_language_model``.
        post_process: stored on the instance, forwarded to
            ``get_language_model``, and gates creation of the heads.
    """
    super(BertModel, self).__init__()
    args = get_args()

    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
    self.add_binary_head = add_binary_head
    self.parallel_output = parallel_output
    self.pre_process = pre_process
    self.post_process = post_process

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=add_binary_head,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=pre_process,
        post_process=post_process)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # The factory itself (not the initializer built from it) is passed on.
    self.initialize_word_embeddings(init_method_normal)

    # No heads are created unless post-processing is enabled.
    if not post_process:
        return

    vocab_rows = self.word_embeddings_weight().size(0)
    self.lm_head = BertLMHead(vocab_rows, args.hidden_size, init_method,
                              args.layernorm_epsilon, parallel_output)
    self._lm_head_key = 'lm_head'

    # binary_head is always defined here; it stays None unless the
    # binary head was requested.
    self.binary_head = (
        get_linear_layer(args.hidden_size, 2, init_method)
        if add_binary_head else None)
    if add_binary_head:
        self._binary_head_key = 'binary_head'
def __init__(self, num_tokentypes=2, pre_process=True, post_process=True):
    """Build the pooled language model and the optional multi-choice head.

    Args:
        num_tokentypes: forwarded unchanged to ``get_language_model``.
        pre_process: stored on the instance and forwarded to
            ``get_language_model``.
        post_process: stored on the instance, forwarded to
            ``get_language_model``, and gates creation of the head.
    """
    super(MultipleChoice, self).__init__(share_word_embeddings=False)
    args = get_args()

    init_std = args.init_method_std
    init_method = init_method_normal(init_std)
    self.pre_process = pre_process
    self.post_process = post_process
    scaled_init_method = scaled_init_method_normal(init_std, args.num_layers)

    language_model, language_model_key = get_language_model(
        num_tokentypes=num_tokentypes,
        add_pooler=True,
        encoder_attn_mask_type=AttnMaskType.padding,
        init_method=init_method,
        scaled_init_method=scaled_init_method,
        pre_process=pre_process,
        post_process=post_process)
    self.language_model = language_model
    self._language_model_key = language_model_key

    # Multi-choice head: skipped entirely when post-processing is off.
    if not post_process:
        return
    self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
    self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                             init_method)
    self._multichoice_head_key = 'multichoice_head'
def __init__(self, num_classes, finetune=False, pre_process=True,
             post_process=True):
    """Build the ViT backbone and, when post-processing, the head.

    Args:
        num_classes: stored on the instance and passed to the head
            constructor.
        finetune: selects a zero-initialised linear layer instead of
            ``VitMlpHead`` as the head.
        pre_process: stored on the instance and forwarded to
            ``VitBackbone``.
        post_process: stored on the instance, forwarded to
            ``VitBackbone``, and gates creation of the head.
    """
    super(VitClassificationModel, self).__init__()
    args = get_args()

    self.hidden_size = args.hidden_size
    self.num_classes = num_classes
    self.finetune = finetune
    self.pre_process = pre_process
    self.post_process = post_process

    self.backbone = VitBackbone(pre_process=pre_process,
                                post_process=post_process,
                                single_token_output=True)

    # No head is created unless post-processing is enabled.
    if not post_process:
        return

    # Both modes store their module under the same attribute name.
    if finetune:
        head = get_linear_layer(self.hidden_size, self.num_classes,
                                torch.nn.init.zeros_)
    else:
        head = VitMlpHead(self.hidden_size, self.num_classes)
    self.head = head
def __init__(self, hidden_size, init_method):
    """Create the pooler's single dense layer.

    Args:
        hidden_size: used for both size arguments of ``get_linear_layer``.
        init_method: initializer handed to ``get_linear_layer``.
    """
    super(Pooler, self).__init__()
    # Both size arguments are the same value.
    width = hidden_size
    self.dense = get_linear_layer(width, width, init_method)