def test_inference_no_head_absolute_embedding(self):
    model = ElectraModel.from_pretrained("google/electra-small-discriminator")
    input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
    attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
    output = model(input_ids, attention_mask=attention_mask)[0]
    expected_shape = torch.Size((1, 11, 256))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[[0.4471, 0.6821, -0.3265], [0.4627, 0.5255, -0.3668], [0.4532, 0.3313, -0.4344]]]
    )
    self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
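# A hedged standalone sketch of the same inference path, using the matching tokenizer
# instead of hardcoded ids. The sentence is illustrative; the hidden size of 256 holds
# for the electra-small checkpoint used in the test above.
import torch
from transformers import ElectraModel, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
model = ElectraModel.from_pretrained("google/electra-small-discriminator")
model.eval()
inputs = tokenizer("ELECTRA uses replaced token detection.", return_tensors="pt")
with torch.no_grad():
    hidden_states = model(**inputs)[0]
print(hidden_states.shape)  # (1, seq_len, 256)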
def __init__(self, config):
    super().__init__(config)
    self.electra = ElectraModel(config)
    self.generator_predictions = ElectraGeneratorPredictions(config)
    self.loss_fct = nn.CrossEntropyLoss(reduction='none')  # -100 index = padding token
    self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
    self.init_weights()
def create_and_check_electra_model(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    model = ElectraModel(config=config)
    model.to(torch_device)
    model.eval()
    (sequence_output,) = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    (sequence_output,) = model(input_ids, token_type_ids=token_type_ids)
    (sequence_output,) = model(input_ids)
    result = {
        "sequence_output": sequence_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
    )
def create_and_check_electra_model_as_decoder(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    encoder_hidden_states,
    encoder_attention_mask,
):
    config.add_cross_attention = True
    model = ElectraModel(config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
    )
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        encoder_hidden_states=encoder_hidden_states,
    )
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    self.parent.assertEqual(
        result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
    )
def __init__(self, extractor, config, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.extractor = extractor
    self.config = config

    if config["pretrained"] == "electra-base-msmarco":
        self.bert = ElectraModel.from_pretrained("Capreolus/electra-base-msmarco")
    elif config["pretrained"] == "bert-base-msmarco":
        self.bert = BertModel.from_pretrained("Capreolus/bert-base-msmarco")
    elif config["pretrained"] == "bert-base-uncased":
        self.bert = BertModel.from_pretrained("bert-base-uncased")
    else:
        raise ValueError(
            f"unsupported model: {config['pretrained']}; need to ensure correct tokenizers will be used before arbitrary hgf models are supported"
        )

    self.transformer_layer_1 = BertLayer(self.bert.config)
    self.transformer_layer_2 = BertLayer(self.bert.config)
    self.num_passages = extractor.config["numpassages"]
    self.maxseqlen = extractor.config["maxseqlen"]
    self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    if config["aggregation"] == "max":
        raise NotImplementedError()
    elif config["aggregation"] == "avg":
        raise NotImplementedError()
    elif config["aggregation"] == "attn":
        raise NotImplementedError()
    elif config["aggregation"] == "transformer":
        self.aggregation = self.aggregate_using_transformer
        input_embeddings = self.bert.get_input_embeddings()
        # TODO hardcoded CLS token id
        cls_token_id = torch.tensor([[101]])
        self.initial_cls_embedding = input_embeddings(cls_token_id).view(1, self.bert.config.hidden_size)
        self.full_position_embeddings = torch.zeros(
            (1, self.num_passages + 1, self.bert.config.hidden_size), requires_grad=True, dtype=torch.float
        )
        torch.nn.init.normal_(self.full_position_embeddings, mean=0.0, std=0.02)
        self.initial_cls_embedding = nn.Parameter(self.initial_cls_embedding, requires_grad=True)
        self.full_position_embeddings = nn.Parameter(self.full_position_embeddings, requires_grad=True)
    else:
        raise ValueError(f"unknown aggregation type: {self.config['aggregation']}")
def __init__(self):
    super(ElectraClassificationHead, self).__init__()
    electra_base = "google/electra-base-discriminator"
    electra_large = "google/electra-large-discriminator"
    self.electra = ElectraModel.from_pretrained(electra_large)
    self.dense = torch.nn.Linear(self.electra.config.hidden_size, self.electra.config.hidden_size)
    self.dropout = torch.nn.Dropout(self.electra.config.hidden_dropout_prob)
    self.out_proj = torch.nn.Linear(self.electra.config.hidden_size, 1)
    self.gelu = torch.nn.GELU()
def __init__(self, config):
    super(ElectraSpanForNer, self).__init__(config)
    self.num_labels = config.num_labels
    self.soft_label = config.soft_label
    self.BaseModel = ElectraModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels)
    if self.soft_label:
        self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels, self.num_labels)
    else:
        self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels)
    self.init_weights()
def __init__(self, config, model_name, only_embedding=True, output_hidden_states=True):
    super(ElectraTokenEmbedder, self).__init__(config)
    self.config = config
    self.only_embedding = only_embedding
    self.model = ElectraModel.from_pretrained(model_name, output_hidden_states=output_hidden_states)
    if self.only_embedding:
        self.model = self.model.get_input_embeddings()
        self.model.weight.requires_grad = False
class ElectraEncoder(ElectraPreTrainedModel):
    def __init__(self, config):
        super(ElectraEncoder, self).__init__(config)
        self.electra = ElectraModel(config)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.electra.forward(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[0][:, 0]  # take the [CLS] token embedding
        return pooled_output
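# A minimal usage sketch for ElectraEncoder, assuming a discriminator checkpoint and its
# matching ElectraTokenizer; the checkpoint name is an example, not taken from the original code.
import torch
from transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
encoder = ElectraEncoder.from_pretrained("google/electra-small-discriminator")
encoder.eval()
inputs = tokenizer("an example sentence", return_tensors="pt")
with torch.no_grad():
    cls_vec = encoder(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        token_type_ids=inputs["token_type_ids"],
    )
print(cls_vec.shape)  # (batch_size, hidden_size)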
def __init__(self, config):
    super().__init__(config)
    self.hidden_size = config.hidden_size
    self.electra = ElectraModel(config)
    self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
    self.pooler_activation = nn.Tanh()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.classifier2 = nn.Linear(config.hidden_size, 2)
    self.gru = GRUWithPadding(config)
    # self.attention = MultiHeadAttention(config.hidden_size)
    self.init_weights()
def __init__(self, model_name, config, num_speakers=2):
    super().__init__(config)
    self.num_speakers = num_speakers
    self.electra = ElectraModel.from_pretrained(model_name)
    self.embeddings = SpeakerAwareElectraEmbeddings(config, self.num_speakers)
    self.num_labels = config.num_labels
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.out_proj = nn.Linear(config.hidden_size, self.num_labels)
    self.gelu = nn.GELU()
    self.init_weights()
def __init__(self, output_size, dropout_rate=0.1, device='cpu'):
    super().__init__()
    self.device = device
    self.dropout = nn.Dropout(p=dropout_rate)
    self.electra = ElectraModel.from_pretrained('google/electra-small-discriminator').to(device)
    # a learnable vector acting as the query for the output attention
    self.cls_query = nn.Parameter(torch.randn(1, self.electra.config.hidden_size, device=device))
    self.cls_att = AttentionModule(
        d_model=self.electra.config.hidden_size,
        d_k=self.electra.config.hidden_size,
        device=device,
        dropout=self.dropout,
    )
    self.output = nn.Linear(self.electra.config.hidden_size, output_size).to(device)
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.electra = ElectraModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.loss_fct = nn.CrossEntropyLoss()
    self.use_crf = config.use_crf
    if self.use_crf:
        self.crf_layer = Transformer_CRF(num_labels=config.num_labels, start_label_id=config.label2idx['CLS'])
    else:
        self.crf_layer = None
    self.init_weights()
def __init__(self, config: TrainConfig, logger: logging.Logger):
    super().__init__()
    self.config = config
    self.electra: ElectraModel = ElectraModel.from_pretrained(config.pretrained_model_name)
    self.dense = nn.Linear(self.electra.config.hidden_size, self.electra.config.hidden_size)
    self.dropout = nn.Dropout(self.electra.config.hidden_dropout_prob)
    self.bias_classifier = nn.Linear(self.electra.config.hidden_size, 3)
    self.hate_classifier = nn.Linear(self.electra.config.hidden_size, 3)
    self.criterion = nn.CrossEntropyLoss()
    self.learning_rate = config.learning_rate
    self.stream_logger = logger
def _get_bert(model_type, model_path_dict):
    if model_type == 'bert':
        config = BertConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = BertModel.from_pretrained(model_path_dict['model'], config=config)
    elif model_type == 'electra':
        config = ElectraConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = ElectraModel.from_pretrained(model_path_dict['model'], config=config)
    elif model_type == 'roberta':
        config = RobertaConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = RobertaModel.from_pretrained(model_path_dict['model'], config=config)
    else:
        # guard against an unhandled model_type so we never hit an undefined `bert`/`config`
        raise ValueError(f"unsupported model_type: {model_type}")
    return bert, config
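# A hypothetical call to _get_bert; the entries in model_path_dict are placeholders, not
# paths from the original project (Hub model ids also work wherever a local path is accepted).
model_path_dict = {
    "config": "google/electra-base-discriminator",
    "model": "google/electra-base-discriminator",
}
bert, config = _get_bert("electra", model_path_dict)
# with config.output_hidden_states = True, forward passes also return all layer hidden states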
def __init__(self, config, bidirectional=True):
    super().__init__(config)
    self.electra = ElectraModel(config)
    feature_dim = config.hidden_size
    if bidirectional:
        feature_dim += config.hidden_size
    self.score = nn.Linear(2 * config.hidden_size, 1)
    self.pooler = nn.Linear(feature_dim, config.hidden_size)
    self.pooler_activation = nn.Tanh()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.classifier2 = nn.Linear(config.hidden_size, 2)
    self.bidirectional = bidirectional
    self.gru1 = GRUWithPadding(config.hidden_size, bidirectional=bidirectional)
    self.init_weights()
    print("bidirectional is: " + str(bidirectional))
def __init__(
    self,
    intent_class_num,
    entity_class_num,
    default_model_path='monologg/koelectra-small-v2-discriminator',
    pad_token_id=0,
):
    super(KoELECTRAFineTuner, self).__init__()
    self.intent_class_num = intent_class_num
    self.entity_class_num = entity_class_num
    self.backbone = ElectraModel.from_pretrained(default_model_path)
    self.feature_dim = self.backbone.config.hidden_size
    self.intent_embedding = nn.Linear(self.feature_dim, self.intent_class_num)
    self.entity_embedding = nn.Linear(self.feature_dim, self.entity_class_num)
    self.entity_featurizer = CRF(self.entity_class_num, batch_first=True)
    self.pad_token_id = pad_token_id
def __init__(self, hidden_dim, pretrained_model):
    super(UtterancePretrainedModel, self).__init__()
    self._pretrained_model = pretrained_model
    if pretrained_model == "bert":
        self._encoder = BertModel.from_pretrained("bert-base-uncased")
    elif pretrained_model == "roberta":
        self._encoder = RobertaModel.from_pretrained("roberta-base")
    elif pretrained_model == "xlnet":
        self._encoder = XLNetModel.from_pretrained("xlnet-base-cased")
    elif pretrained_model == "albert":
        self._encoder = AlbertModel.from_pretrained("albert-base-v2")
    elif pretrained_model == "electra":
        self._encoder = ElectraModel.from_pretrained("google/electra-base-discriminator")
    else:
        assert False, "Something wrong with the parameter --pretrained_model"
    self._linear = nn.Linear(UtterancePretrainedModel.HIDDEN_DIM, hidden_dim)
def __init__(self, configs):
    super(ElectraClassification, self).__init__()
    self.configs = configs
    self.bert_hiddensize = self.configs["bert_hiddensize"]
    self.dense = self.configs["dense"]
    self.label_nums = self.configs["label_nums"]
    self.dropout = self.configs["dropout"]
    self.electra_model = ElectraModel.from_pretrained(self.configs["path"]["electra_path"])
    # for p in self.bert_model.parameters():
    #     p.requires_grad = True
    # output shape of bert: (batch_size, seqlens, lstm_hiddensize)
    self.classification = torch.nn.Sequential(
        torch.nn.Linear(self.bert_hiddensize, self.dense),
        torch.nn.ReLU(),
        torch.nn.Dropout(self.dropout),
        torch.nn.Linear(self.dense, self.label_nums),
    )
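# A hypothetical configs dict matching the keys read in the constructor above; the values
# and checkpoint name are illustrative only, not taken from the original project.
configs = {
    "bert_hiddensize": 768,
    "dense": 256,
    "label_nums": 2,
    "dropout": 0.1,
    "path": {"electra_path": "google/electra-base-discriminator"},
}
model = ElectraClassification(configs)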
def __init__(
    self,
    backbone: None,
    vocab_size: int,
    seq_len: int,
    intent_class_num: int,
    entity_class_num: int,
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    dim_feedforward=2048,
    dropout=0.1,
    activation="relu",
    pad_token_id: int = 0,
):
    super(EmbeddingTransformer, self).__init__()
    self.backbone = backbone
    self.seq_len = seq_len
    self.pad_token_id = pad_token_id

    if backbone is None:
        self.encoder = nn.TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation),
            num_encoder_layers,
            LayerNorm(d_model),
        )
    else:
        # use a pre-defined model architecture
        if backbone == "kobert":
            self.encoder = get_kobert_model()
        elif backbone == "distill_kobert":
            self.encoder = get_distilkobert_model()
        elif backbone == "koelectra":
            self.encoder = ElectraModel.from_pretrained("monologg/koelectra-small-v2-discriminator")
        d_model = self.encoder.config.hidden_size

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(self.seq_len, d_model)
    self.intent_feature = nn.Linear(d_model, intent_class_num)
    self.entity_feature = nn.Linear(d_model, entity_class_num)
def __init__(self, config, need_birnn=False, rnn_dim=128):
    super(Electra_BiLSTM_CRF, self).__init__(config)
    self.num_tags = config.num_labels
    self.electra = ElectraModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    out_dim = config.hidden_size
    self.need_birnn = need_birnn

    # if need_birnn is False, skip the BiLSTM layer
    if need_birnn:
        self.birnn = nn.LSTM(config.hidden_size, rnn_dim, num_layers=1, bidirectional=True, batch_first=True)
        out_dim = rnn_dim * 2

    self.hidden2tag = nn.Linear(out_dim, config.num_labels)
    self.crf = CRF(config.num_labels, batch_first=True)
def __get_model_and_tokenizer(self):
    model, tokenizer = None, None
    if self.transformer_model == TransformerType.BERT:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        model = BertModel.from_pretrained('bert-base-cased')
    elif self.transformer_model == TransformerType.XLNet:
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetModel.from_pretrained('xlnet-base-cased')
    elif self.transformer_model == TransformerType.RoBERTa:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaModel.from_pretrained('roberta-base')
    elif self.transformer_model == TransformerType.ELECTRA:
        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraModel.from_pretrained('google/electra-small-discriminator')
    return model, tokenizer
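# A hedged sketch of using the ELECTRA branch above for feature extraction with mean pooling
# over the attention mask; the pooling choice and example sentence are assumptions, not from
# the original code.
import torch
from transformers import ElectraModel, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')
enc = tokenizer("a short example sentence", return_tensors='pt')
with torch.no_grad():
    token_states = model(**enc)[0]                          # (1, seq_len, hidden_size)
mask = enc['attention_mask'].unsqueeze(-1).float()          # (1, seq_len, 1)
sentence_vec = (token_states * mask).sum(1) / mask.sum(1)   # (1, hidden_size)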
def __init__(self, config: dict):
    super(Model, self).__init__()
    self.electra_cfg = ElectraConfig()
    self.electra = ElectraModel.from_pretrained(
        config["pretrained_dir"] + "electra_small.index", config=self.electra_cfg, from_tf=True
    )
    self.sentence_encoder = AttentionSentenceEncoder(
        self.electra_cfg.hidden_size, config["sent_head"], config["max_sents"] + 1
    )  # one extra position for the CLS token
    self.img_encoder = SimpleImageEncoder(
        config["img_input_size"], config["img_output_size"], config["img_num"], dropout=config["dropout"]
    )
    self.output_layer = OutputLayer(
        config["task"],
        self.electra_cfg.hidden_size + config["img_output_size"],
        config["output_size"],
        config["dropout"],
    )
def __init__(self, config, add_GRU=True, bidirectional=False, word_level=False, add_cls=False, word_and_sent=False):
    super().__init__(config)
    self.electra = ElectraModel(config)

    feature_dim = config.hidden_size
    if bidirectional:
        feature_dim += config.hidden_size
    if word_and_sent:
        feature_dim *= 2
    if add_cls:
        feature_dim += config.hidden_size

    self.pooler = nn.Linear(feature_dim, config.hidden_size)
    self.pooler_activation = nn.Tanh()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.classifier2 = nn.Linear(config.hidden_size, 2)
    self.add_GRU = add_GRU
    self.word_level = word_level
    self.add_cls = add_cls
    self.bidirectional = bidirectional
    self.word_and_sent = word_and_sent

    if self.add_GRU:
        self.gru = GRUWithPadding(config.hidden_size)
        # self.gru = nn.GRU(config.hidden_size, config.hidden_size, num_layers=1, batch_first=True, bidirectional=bidirectional)
    if self.word_and_sent:
        self.gru2 = nn.GRU(config.hidden_size, config.hidden_size, num_layers=1, batch_first=True, bidirectional=bidirectional)

    self.init_weights()
    print("add_GRU is: " + str(add_GRU))
    print("bidirectional is: " + str(bidirectional))
    print("word_level is: " + str(word_level))
    print("add_cls is: " + str(add_cls))
    print("word_and_sent is: " + str(word_and_sent))
def __init__(self, config: ElectraConfig, args: Namespace, bias_label_lst=None, hate_label_lst=None):
    super().__init__(config)
    self.args = args
    self.num_bias_labels = len(bias_label_lst) if bias_label_lst is not None else 0
    self.num_hate_labels = len(hate_label_lst) if hate_label_lst is not None else 0
    self.electra = ElectraModel(config)
    self.bias_classifier = BiasClassificationHead(config, self.num_bias_labels)
    self.hate_classifier = HateClassificationHead(config, self.num_hate_labels)
    self.loss_fct = nn.CrossEntropyLoss()
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.num_regs = config.num_regs
    self.electra = ElectraModel(config)
    self.classifier = nn.Sequential(
        nn.Dropout(config.hidden_dropout_prob),
        nn.Linear(config.hidden_size, config.hidden_size),
        nn.GELU(),
        nn.Dropout(config.hidden_dropout_prob),
        nn.Linear(config.hidden_size, config.num_labels),
    )
    self.regressor = nn.Sequential(
        nn.Dropout(config.hidden_dropout_prob),
        nn.Linear(config.hidden_size, config.hidden_size),
        nn.GELU(),
        nn.Dropout(config.hidden_dropout_prob),
        nn.Linear(config.hidden_size, config.num_regs),
    )
def __init__(self, config, num_rnn=1, num_decoupling=1):
    super().__init__(config)
    self.electra = ElectraModel(config)
    self.num_decoupling = num_decoupling
    self.localMHA = nn.ModuleList([MHA(config) for _ in range(num_decoupling)])
    self.globalMHA = nn.ModuleList([MHA(config) for _ in range(num_decoupling)])
    self.fuse1 = FuseLayer(config)
    self.fuse2 = FuseLayer(config)
    self.gru1 = GRUWithPadding(config, num_rnn)
    self.pooler = nn.Linear(2 * config.hidden_size, config.hidden_size)
    self.pooler_activation = nn.Tanh()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.init_weights()
def __init__(self, config, bidirectional=False):
    super().__init__(config)
    self.electra = ElectraModel(config)

    feature_dim = config.hidden_size * 4
    if bidirectional:
        feature_dim *= 2

    # Attention Flow Layer
    self.att_weight_c = nn.Linear(config.hidden_size, 1)
    self.att_weight_q = nn.Linear(config.hidden_size, 1)
    self.att_weight_cq = nn.Linear(config.hidden_size, 1)

    self.pooler = nn.Linear(feature_dim, config.hidden_size)
    self.pooler_activation = nn.Tanh()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.classifier2 = nn.Linear(config.hidden_size, 2)
    self.bidirectional = bidirectional
    self.gru1 = GRUWithPadding(config.hidden_size * 4, bidirectional=bidirectional)
    self.init_weights()
    print("bidirectional is: " + str(bidirectional))
def __init__(self, config, n_layers=2, activation='relu', beta=100):
    super(ElectraForConversationalQuestionAnswering, self).__init__(config)
    self.electra = ElectraModel(config)
    hidden_size = config.hidden_size
    self.rational_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 1, activation)
    self.logits_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 2, activation)
    self.unk_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 1, activation)
    self.attention_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 1, activation)
    self.yn_l = MultiLinearLayer(n_layers, hidden_size, hidden_size, 2, activation)
    self.beta = beta
    self.init_weights()
class Pronunciation2Spelling(nn.Module):
    def __init__(self, enc_config, dec_config):
        super(Pronunciation2Spelling, self).__init__()
        self.encoders = ElectraModel(enc_config)
        self.embedding = self.encoders.get_input_embeddings()
        if enc_config.embedding_size != dec_config.hidden_size:
            self.embedding_projection = nn.Linear(enc_config.embedding_size, dec_config.hidden_size)
        self.decoders = Decoders(dec_config)
        self.dense = nn.Linear(dec_config.hidden_size, dec_config.trg_vocab_size)
        self.padding_idx = dec_config.padding_idx

    def forward(self, enc_ids, dec_ids):
        dec_embeddings = self.embedding(dec_ids)
        if hasattr(self, 'embedding_projection'):
            dec_embeddings = self.embedding_projection(dec_embeddings)
        enc_outputs = self.encoders(enc_ids).last_hidden_state
        dec_outputs, _, _ = self.decoders(enc_ids, enc_outputs, dec_ids, dec_embeddings)
        model_output = self.dense(dec_outputs)
        return model_output