def __init__(self, config):
    super().__init__()
    self._temperature = config.temperature
    self._pad_token_id = config.pad_token_id
    self.layer_norm = nn.LayerNorm(config.embedding_size)
    self.dense = nn.Linear(config.hidden_size, config.embedding_size)
    self.activation = get_activation(config.hidden_act)
def forward(self, features, **kwargs):
    x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
    x = self.dropout1(x)
    x = self.dense(x)
    x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems the Electra authors used gelu
    x = self.dropout2(x)
    x = self.out_proj(x)
    return x
def call_adapter(self, inputs, adapter_weights):
    """Computes the output of the adapter layers."""
    down = F.linear(inputs, weight=adapter_weights.down.weight, bias=adapter_weights.down.bias)
    middle = get_activation(self.activation_type)(down)
    output = F.linear(middle, weight=adapter_weights.up.weight, bias=adapter_weights.up.bias)
    return output
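# Hedged usage sketch for call_adapter above. The AdapterWeights container and
# its down/up projections are illustrative assumptions, not from the source;
# gelu stands in for self.activation_type.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdapterWeights(nn.Module):
    """Hypothetical bottleneck adapter weights: hidden -> bottleneck -> hidden."""
    def __init__(self, hidden_size, bottleneck_size):
        super().__init__()
        self.down = nn.Linear(hidden_size, bottleneck_size)
        self.up = nn.Linear(bottleneck_size, hidden_size)

weights = AdapterWeights(hidden_size=768, bottleneck_size=64)
inputs = torch.randn(2, 16, 768)  # (batch, seq_len, hidden)
# mirrors the three steps of call_adapter
down = F.linear(inputs, weight=weights.down.weight, bias=weights.down.bias)
output = F.linear(F.gelu(down), weight=weights.up.weight, bias=weights.up.bias)
assert output.shape == inputs.shape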
def forward(self, *args, **kwargs):
    x = self.pretrained_model(*args, **kwargs)[0][:, 0, :]  # take <s> token (equiv. to [CLS])
    x = self.d1(x)
    x = self.l1(x)
    x = self.bn1(x)
    x = get_activation("gelu")(x)
    x = self.d2(x)
    x = self.l2(x)
    return x
def forward(self, discriminator_hidden_states, attention_mask, labels):
    hidden_states = self.dense(discriminator_hidden_states)
    hidden_states = get_activation(self.config.hidden_act)(hidden_states)
    # squeeze only the trailing dim; the original squeeze_() would also drop a batch dim of 1
    logits = self.dense_prediction(hidden_states).squeeze(-1)
    probs = torch.sigmoid(logits)
    preds = torch.round((logits.sign() + 1) / 2)
    loss_fct = nn.BCEWithLogitsLoss()
    loss = loss_fct(logits.view(-1, discriminator_hidden_states.shape[1]), labels.float())
    return probs, preds, loss
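# Side note, as a hedged sketch with dummy tensors: nn.BCEWithLogitsLoss fuses
# the sigmoid into the loss, so `probs` above is for reporting only and the
# raw logits (not probs) are what must reach the loss.
import torch
import torch.nn as nn

logits = torch.randn(4, 128)                    # per-token replaced/original logits
labels = torch.randint(0, 2, (4, 128)).float()
fused = nn.BCEWithLogitsLoss()(logits, labels)
manual = nn.BCELoss()(torch.sigmoid(logits), labels)
assert torch.allclose(fused, manual, atol=1e-6)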
def __init__(
    self,
    web_hidden_size,
    linear_hidden=1536,
    dropout=0.1,
    activation_string="gelu",
):
    super(LinearClassifier, self).__init__()
    self.dropout1 = nn.Dropout(dropout) if dropout else nn.Identity()
    self.linear1 = nn.Linear(web_hidden_size, linear_hidden)
    self.linear2 = nn.Linear(linear_hidden, 1)
    # self.sigmoid = nn.Sigmoid()
    # support older versions of huggingface/transformers
    if activation_string == "gelu":
        self.activation = nn.GELU()
    else:
        self.activation = get_activation(activation_string) if activation_string else nn.Identity()
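# Hedged usage sketch: LinearClassifier's forward is not shown here, so the
# wiring below (dropout1 -> linear1 -> activation -> linear2) is an assumed
# reading of the __init__ above, not the author's forward.
import torch

clf = LinearClassifier(web_hidden_size=768)
x = torch.randn(8, 768)                       # one pooled vector per sentence
h = clf.activation(clf.linear1(clf.dropout1(x)))
scores = clf.linear2(h).squeeze(-1)           # (8,) per-sentence scores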
def forward(self, generator_hidden_states):
    hidden_states = self.dense(generator_hidden_states)
    hidden_states = get_activation("gelu")(hidden_states)
    hidden_states = self.LayerNorm(hidden_states)
    return hidden_states
def forward(self, discriminator_hidden_states, attention_mask):
    hidden_states = self.dense(discriminator_hidden_states)
    hidden_states = get_activation(self.config.hidden_act)(hidden_states)
    logits = self.dense_prediction(hidden_states).squeeze()
    return logits
def test_get_activation(self):
    get_activation("swish")
    get_activation("silu")
    get_activation("relu")
    get_activation("tanh")
    get_activation("gelu_new")
    get_activation("gelu_fast")
    get_activation("gelu_python")
    get_activation("quick_gelu")
    get_activation("mish")
    get_activation("linear")
    get_activation("sigmoid")
    with self.assertRaises(KeyError):
        get_activation("bogus")
    with self.assertRaises(KeyError):
        get_activation(None)
def test_gelu_versions(self):
    x = torch.tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
    torch_builtin = get_activation("gelu")
    self.assertTrue(torch.allclose(gelu_python(x), torch_builtin(x)))
    self.assertFalse(torch.allclose(gelu_python(x), gelu_new(x)))
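# For reference, the two variants exercised above; these match the standard
# definitions (exact erf form vs. tanh approximation), which is why the test
# expects gelu_python to agree with torch's builtin but not with gelu_new.
import math
import torch

def gelu_python(x):
    # exact GELU: x * Phi(x), with Phi the standard normal CDF
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_new(x):
    # tanh approximation used by the GPT-2/BERT codebases
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))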
def __init__(self, activation_type):
    super().__init__()
    self.f = get_activation(activation_type)
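# Minimal sketch wrapping the __init__ above in a complete module; the class
# name `Activation` and its forward are assumptions for illustration.
import torch
import torch.nn as nn
from transformers.activations import get_activation

class Activation(nn.Module):
    def __init__(self, activation_type):
        super().__init__()
        self.f = get_activation(activation_type)

    def forward(self, x):
        return self.f(x)

act = Activation("gelu")
print(act(torch.tensor([-1.0, 0.0, 1.0])))  # tensor([-0.1587, 0.0000, 0.8413])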
def forward(self, discriminator_hidden_states):
    hidden_states = self.dense(discriminator_hidden_states)
    hidden_states = get_activation(self.config.hidden_act)(hidden_states)
    logits = self.dense_prediction(hidden_states)
    return logits
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
            position_ids=None, head_mask=None, inputs_embeds=None,
            output_attentions=None, entity_token_ids=None):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids,
                                    token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
    if hasattr(self, "embeddings_project"):
        hidden_states = self.embeddings_project(hidden_states)

    hidden_states = self.encoder(
        hidden_states,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        # output_attentions=output_attentions,
    )
    sequence_output = hidden_states[0]

    # Gather one entity embedding per example. torch.stack keeps the result on
    # the right device and inside the autograd graph; the original tolist() ->
    # torch.tensor().cuda() round trip detached it and forced a host copy.
    batch_size = sequence_output.shape[0]
    batch_embedding = torch.stack(
        [sequence_output[i][entity_token_ids[i]] for i in range(batch_size)])

    sequence_output_cls = batch_embedding
    x = self.dropout(sequence_output_cls)
    x = self.dense(x)
    x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems the Electra authors used gelu
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
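# The per-example gather above can also be fully vectorized with advanced
# indexing; a hedged sketch assuming entity_token_ids holds one token index
# per example (dummy shapes, illustrative names):
import torch

sequence_output = torch.randn(4, 128, 256)      # (batch, seq_len, hidden)
entity_token_ids = torch.tensor([3, 7, 0, 42])  # one entity position per example
batch_idx = torch.arange(sequence_output.size(0))
batch_embedding = sequence_output[batch_idx, entity_token_ids]  # (batch, hidden)
assert batch_embedding.shape == (4, 256)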
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    output_attentions=None,
    used_entity_token=False,
    masked_entities_list=None,
    chemical_code_list=None,
    disease_code_list=None,
    is_full_sample=False,
    label_length=0,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids,
                                    token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
    if hasattr(self, "embeddings_project"):
        hidden_states = self.embeddings_project(hidden_states)

    hidden_states = self.encoder(
        hidden_states,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        # output_attentions=output_attentions,
    )
    sequence_output = hidden_states[0]
    batch_size = chemical_code_list.shape[0]

    def get_entity_embedding(token_embedding, masked_entities, code):
        """Return the embedding of the first token tagged with `code` (None if absent)."""
        embedding = None
        for i, mask in enumerate(masked_entities):
            if mask == code:
                embedding = token_embedding[i]
                break
        return embedding

    def get_all_entity_embedding(token_embedding, masked_entities, code):
        """Collect the first-token embedding of every contiguous run tagged with `code`."""
        embedding_size = token_embedding.size(-1)
        embedding = []
        previous_idx = -2  # sentinel so the first match never looks like a continuation
        for i, mask in enumerate(masked_entities):
            if mask == code:
                if i != previous_idx + 1:  # first token of a new run
                    embedding.append(token_embedding[i])
                previous_idx = i
        if len(embedding) == 0:
            embedding = [torch.zeros(embedding_size, device=token_embedding.device)]
        return torch.stack(embedding)

    def generate_code_pairs_list(chemical_code_list_encoded, disease_code_list_encoded, label_len):
        """Cross every valid chemical code with every valid disease code, padding with -1 up to label_len."""
        chemical_codes = []
        disease_codes = []
        chemical_code_size = list(chemical_code_list_encoded.size())
        disease_code_size = list(disease_code_list_encoded.size())
        for i in range(chemical_code_size[0]):
            if chemical_code_list_encoded[i] == -1:
                break
            for j in range(disease_code_size[0]):
                if disease_code_list_encoded[j] == -1:
                    break
                chemical_codes.append(chemical_code_list_encoded[i])
                disease_codes.append(disease_code_list_encoded[j])
        for _ in range(len(chemical_codes), label_len):
            chemical_codes.append(-1)
            disease_codes.append(-1)
        return chemical_codes, disease_codes

    if not is_full_sample:
        batch_embedding = []
        for i in range(batch_size):
            masked_entities = masked_entities_list[i]
            chemical_code = chemical_code_list[i]
            disease_code = disease_code_list[i]
            token_embedding = sequence_output[i]
            chemical_embedding = get_entity_embedding(token_embedding, masked_entities, chemical_code)
            disease_embedding = get_entity_embedding(token_embedding, masked_entities, disease_code)
            entity_embedding = torch.cat((chemical_embedding, disease_embedding), 0)
            batch_embedding.append(entity_embedding)
        # torch.stack keeps the embeddings on-device and in the autograd graph
        # (the original tolist() -> torch.tensor().cuda() round trip did not)
        batch_embedding = torch.stack(batch_embedding)
    else:
        batch_embedding = []
        for i in range(batch_size):
            masked_entities = masked_entities_list[i]
            chemical_codes, disease_codes = generate_code_pairs_list(
                chemical_code_list[i], disease_code_list[i], label_length)
            token_embedding = sequence_output[i]
            current_output = []
            for j in range(len(chemical_codes)):
                chemical_embeddings = get_all_entity_embedding(token_embedding, masked_entities, chemical_codes[j])
                disease_embeddings = get_all_entity_embedding(token_embedding, masked_entities, disease_codes[j])
                chemical_embedding = torch.mean(chemical_embeddings, dim=0)
                disease_embedding = torch.mean(disease_embeddings, dim=0)
                r_rep = torch.cat([chemical_embedding, disease_embedding], 0)
                current_output.append(r_rep)
            batch_embedding.append(torch.stack(current_output).unsqueeze(0))
        batch_embedding = torch.cat(batch_embedding, 0)

    # shared classification head (identical in both branches of the original)
    x = self.dropout(batch_embedding)
    x = self.dense(x)
    x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems the Electra authors used gelu
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    output_attentions=None,
    used_entity_token=False,
):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.ElectraConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        from transformers import ElectraModel, ElectraTokenizer
        import torch

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraModel.from_pretrained('google/electra-small-discriminator')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids,
                                    token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
    if hasattr(self, "embeddings_project"):
        hidden_states = self.embeddings_project(hidden_states)

    hidden_states = self.encoder(
        hidden_states,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        # output_attentions=output_attentions,
    )
    sequence_output = hidden_states[0]
    sequence_output_cls = sequence_output[:, 0, :]

    x = self.dropout(sequence_output_cls)
    x = self.dense(x)
    x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems the Electra authors used gelu
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
            position_ids=None, head_mask=None, inputs_embeds=None,
            output_attentions=None, used_entity_token=True,
            masked_entities_list=None, chemical_code_list=None,
            disease_code_list=None, other_code_list=None):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids,
                                    token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
    if hasattr(self, "embeddings_project"):
        hidden_states = self.embeddings_project(hidden_states)

    hidden_states = self.encoder(
        hidden_states,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        # output_attentions=output_attentions,
    )
    batch_size = chemical_code_list.shape[0]
    token_embedding_output = hidden_states[0]

    def get_entity_embedding(token_embedding, masked_entities, code):
        """Sum the embeddings of all tokens tagged with `code`, divided by the number of contiguous runs."""
        count = 0
        embedding = None
        check = True
        for i, mask in enumerate(masked_entities):
            if mask == code:
                if check:
                    count += 1  # entering a new contiguous run
                    check = False
                if embedding is None:  # `== None` on a tensor is unreliable; use `is None`
                    embedding = token_embedding[i]
                else:
                    # out-of-place add: `embedding += ...` would mutate the view of token_embedding
                    embedding = embedding + token_embedding[i]
            else:
                check = True
        embedding = embedding / count
        return embedding

    batch_embedding = []
    if not used_entity_token:
        for i in range(batch_size):
            masked_entities = masked_entities_list[i]
            chemical_code = chemical_code_list[i]
            disease_code = disease_code_list[i]
            other_code = other_code_list[i]
            token_embedding = token_embedding_output[i]
            # a code of -1 marks a missing entity; exactly one of the three is expected to be -1
            if chemical_code == -1:
                other_embedding = get_entity_embedding(token_embedding, masked_entities, other_code)
                disease_embedding = get_entity_embedding(token_embedding, masked_entities, disease_code)
                entity_embedding = torch.cat((disease_embedding, other_embedding), 0)
            elif disease_code == -1:
                chemical_embedding = get_entity_embedding(token_embedding, masked_entities, chemical_code)
                other_embedding = get_entity_embedding(token_embedding, masked_entities, other_code)
                entity_embedding = torch.cat((chemical_embedding, other_embedding), 0)
            elif other_code == -1:
                chemical_embedding = get_entity_embedding(token_embedding, masked_entities, chemical_code)
                disease_embedding = get_entity_embedding(token_embedding, masked_entities, disease_code)
                entity_embedding = torch.cat((chemical_embedding, disease_embedding), 0)
            batch_embedding.append(entity_embedding)
        # torch.stack keeps the embeddings on-device and in the autograd graph
        batch_embedding = torch.stack(batch_embedding)

    sequence_output_cls = batch_embedding
    x = self.dropout(sequence_output_cls)
    x = self.dense(x)
    x = get_activation("tanh")(x)
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
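# The run-counting average in get_entity_embedding above (it divides the sum
# of all tagged token embeddings by the number of contiguous runs, not by the
# token count) can be reproduced without the Python loop. A hedged sketch with
# dummy shapes; all names below are illustrative:
import torch

token_embedding = torch.randn(128, 256)        # (seq_len, hidden)
masked_entities = torch.randint(0, 5, (128,))  # per-token entity codes
code = 3

mask = masked_entities == code
token_sum = token_embedding[mask].sum(dim=0)
# a run starts at position 0 if tagged, or wherever a True follows a False
num_runs = (mask[1:] & ~mask[:-1]).sum() + mask[0].long()
embedding = token_sum / num_runs               # matches the loop's embedding / count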
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    output_attentions=None,
    used_entity_token=False,
    masked_entities_list=None,
    chemical_code_list=None,
    disease_code_list=None,
):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.ElectraConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        from transformers import ElectraModel, ElectraTokenizer
        import torch

        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraModel.from_pretrained('google/electra-small-discriminator')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids,
                                    token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
    if hasattr(self, "embeddings_project"):
        hidden_states = self.embeddings_project(hidden_states)

    hidden_states = self.encoder(
        hidden_states,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        # output_attentions=output_attentions,
    )
    batch_size = chemical_code_list.shape[0]
    token_embedding_output = hidden_states[0]

    def get_entity_embedding(token_embedding, masked_entities, code):
        """Sum the embeddings of all tokens tagged with `code`, divided by the number of contiguous runs."""
        count = 0
        embedding = torch.zeros(token_embedding.shape[1], device=token_embedding.device)
        check = True
        for i, mask in enumerate(masked_entities):
            if mask == code:
                if check:
                    count += 1  # entering a new contiguous run
                    check = False
                embedding = embedding + token_embedding[i]
            else:
                check = True
        embedding = embedding / count
        return embedding

    batch_embedding = []
    if not used_entity_token:
        for i in range(batch_size):
            masked_entities = masked_entities_list[i]
            chemical_code = chemical_code_list[i]
            disease_code = disease_code_list[i]
            token_embedding = token_embedding_output[i]
            chemical_embedding = get_entity_embedding(token_embedding, masked_entities, chemical_code)
            disease_embedding = get_entity_embedding(token_embedding, masked_entities, disease_code)
            entity_embedding = torch.cat((chemical_embedding, disease_embedding), 0)
            batch_embedding.append(entity_embedding)
    # torch.stack keeps the embeddings on-device and in the autograd graph
    # (the original tolist() -> torch.tensor().cuda() round trip did not)
    batch_embedding = torch.stack(batch_embedding)

    sequence_output_cls = batch_embedding
    x = self.dropout(sequence_output_cls)
    x = self.dense(x)
    x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems the Electra authors used gelu
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
def test_get_activation(self):
    get_activation("swish")
    get_activation("relu")
    get_activation("tanh")
    get_activation("gelu_new")
    get_activation("gelu_fast")
    with self.assertRaises(KeyError):
        get_activation("bogus")
    with self.assertRaises(KeyError):
        get_activation(None)
def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.activation = get_activation(config.hidden_act)
    self.dense_prediction = nn.Linear(config.hidden_size, 1)
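# Hedged end-to-end sketch tying the __init__ above to the discriminator
# forward shown earlier; `Head` and the config numbers are illustrative
# stand-ins (the layout mirrors ElectraDiscriminatorPredictions).
import torch
import torch.nn as nn
from transformers.activations import get_activation

class Head(nn.Module):
    def __init__(self, hidden_size=256, hidden_act="gelu"):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = get_activation(hidden_act)
        self.dense_prediction = nn.Linear(hidden_size, 1)

    def forward(self, discriminator_hidden_states):
        hidden_states = self.activation(self.dense(discriminator_hidden_states))
        return self.dense_prediction(hidden_states).squeeze(-1)  # per-token logits

logits = Head()(torch.randn(2, 16, 256))  # (batch, seq_len)
assert logits.shape == (2, 16)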