def test_sanity(self):
    class TextEmbeddings(TransformerModule, FromParams):
        def __init__(
            self,
            vocab_size: int,
            hidden_size: int,
            pad_token_id: int,
            max_position_embeddings: int,
            type_vocab_size: int,
            dropout: float,
        ):
            super().__init__()
            self.word_embeddings = torch.nn.Embedding(
                vocab_size, hidden_size, padding_idx=pad_token_id
            )
            self.position_embeddings = torch.nn.Embedding(max_position_embeddings, hidden_size)
            self.token_type_embeddings = torch.nn.Embedding(type_vocab_size, hidden_size)
            self.layer_norm = torch.nn.LayerNorm(hidden_size, eps=1e-12)
            self.dropout = torch.nn.Dropout(dropout)

        def forward(
            self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None
        ):
            if input_ids is not None:
                input_shape = input_ids.size()
            else:
                input_shape = inputs_embeds.size()[:-1]

            seq_length = input_shape[1]
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            if position_ids is None:
                position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
                position_ids = position_ids.unsqueeze(0).expand(input_shape)
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)

            position_embeddings = self.position_embeddings(position_ids)
            token_type_embeddings = self.token_type_embeddings(token_type_ids)

            embeddings = inputs_embeds + position_embeddings + token_type_embeddings
            embeddings = self.layer_norm(embeddings)
            embeddings = self.dropout(embeddings)
            return embeddings

    torch.manual_seed(23)
    text = TextEmbeddings(10, 5, 2, 3, 7, 0.0)
    torch.manual_seed(23)
    transformer = TransformerEmbeddings(10, 5, 2, 3, 7, 0.0)

    input_ids = torch.tensor([[1, 2]])
    token_type_ids = torch.tensor([[1, 0]], dtype=torch.long)
    position_ids = torch.tensor([[0, 1]])

    text_output = text.forward(input_ids, token_type_ids, position_ids)
    transformer_output = transformer.forward(input_ids, token_type_ids, position_ids)

    assert_allclose(text_output, transformer_output)
def __init__(self):
    super().__init__()
    self.embeddings = TransformerEmbeddings.from_pretrained_module(pretrained)
    self.transformer = TransformerStack.from_pretrained_module(
        pretrained, num_hidden_layers=4
    )
def test_no_token_type_layer(self):
    params = copy.deepcopy(self.params_dict)
    params["type_vocab_size"] = 0
    params = Params(params)
    module = TransformerEmbeddings.from_params(params)
    assert len(module.embeddings) == 2
def __init__( self, vocab: Vocabulary, transformer_model: str = "roberta-large", override_weights_file: Optional[str] = None, **kwargs ) -> None: super().__init__(vocab, **kwargs) transformer_kwargs = { "model_name": transformer_model, "weights_path": override_weights_file, } self.embeddings = TransformerEmbeddings.from_pretrained_module(**transformer_kwargs) self.transformer_stack = TransformerStack.from_pretrained_module(**transformer_kwargs) self.pooler = TransformerPooler.from_pretrained_module(**transformer_kwargs) self.pooler_dropout = Dropout(p=0.1) self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(), 1) self.linear_layer.weight.data.normal_(mean=0.0, std=0.02) self.linear_layer.bias.data.zero_() self.loss = torch.nn.CrossEntropyLoss() from allennlp.training.metrics import CategoricalAccuracy self.accuracy = CategoricalAccuracy()
def from_huggingface_model_name(
    cls,
    vocab: Vocabulary,
    model_name: str,
    image_feature_dim: int,
    image_num_hidden_layers: int,
    image_hidden_size: int,
    image_num_attention_heads: int,
    combined_hidden_size: int,
    combined_num_attention_heads: int,
    pooled_output_dim: int,
    image_intermediate_size: int,
    image_attention_dropout: float,
    image_hidden_dropout: float,
    image_biattention_id: List[int],
    text_biattention_id: List[int],
    text_fixed_layer: int,
    image_fixed_layer: int,
    pooled_dropout: float = 0.1,
    fusion_method: str = "sum",
    *,
    ignore_text: bool = False,
    ignore_image: bool = False,
):
    text_embeddings = TransformerEmbeddings.from_pretrained_module(model_name)

    image_embeddings = ImageFeatureEmbeddings(
        feature_size=image_feature_dim,
        embedding_size=image_hidden_size,
        dropout=image_hidden_dropout,
    )

    encoder = BiModalEncoder.from_pretrained_module(
        model_name,
        num_hidden_layers2=image_num_hidden_layers,
        hidden_size2=image_hidden_size,
        num_attention_heads2=image_num_attention_heads,
        combined_hidden_size=combined_hidden_size,
        combined_num_attention_heads=combined_num_attention_heads,
        intermediate_size2=image_intermediate_size,
        attention_dropout2=image_attention_dropout,
        hidden_dropout2=image_hidden_dropout,
        biattention_id1=text_biattention_id,
        biattention_id2=image_biattention_id,
        fixed_layer1=text_fixed_layer,
        fixed_layer2=image_fixed_layer,
    )

    return cls(
        vocab=vocab,
        text_embeddings=text_embeddings,
        image_embeddings=image_embeddings,
        encoder=encoder,
        pooled_output_dim=pooled_output_dim,
        fusion_method=fusion_method,
        dropout=pooled_dropout,
        ignore_text=ignore_text,
        ignore_image=ignore_image,
    )
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.get_relevant_module( "albert-base-v2") self.transformer = TransformerStack.from_pretrained_module( "bert-base-uncased") # We want to tune only the embeddings, because that's our experiment. self.transformer.requires_grad = False
def setup_method(self):
    super().setup_method()
    self.params_dict = {key: val for key, val in PARAMS_DICT.items()}
    params = Params(copy.deepcopy(self.params_dict))
    self.transformer_embeddings = TransformerEmbeddings.from_params(params)
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( pretrained, relevant_module="bert.embeddings") self.transformer = TransformerStack.from_pretrained_module( pretrained, num_hidden_layers=4, relevant_module="bert.encoder", strict=False, )
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-uncased") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(0, 8)) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(8, 12), )
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    pretrained_module = cached_transformers.get(pretrained_name, False).embeddings
    module = TransformerEmbeddings.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    missing = assert_equal_parameters(pretrained_module, module, mapping=mapping)
    assert len(missing) == 0
def test_output_size(params):
    input_ids = torch.tensor([[1, 2]])
    token_type_ids = torch.tensor([[1, 0]], dtype=torch.long)
    position_ids = torch.tensor([[0, 1]])
    params["output_size"] = 7
    module = TransformerEmbeddings.from_params(params)
    output = module(
        input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids
    )
    assert output.shape[-1] == 7
def test_output_size(self):
    input_ids = torch.tensor([[1, 2]])
    token_type_ids = torch.tensor([[1, 0]], dtype=torch.long)
    position_ids = torch.tensor([[0, 1]])
    params = copy.deepcopy(self.params_dict)
    params["output_size"] = 7
    params = Params(params)
    module = TransformerEmbeddings.from_params(params)
    output = module.forward(
        input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids
    )
    assert output.shape[-1] == 7
def test_loading_albert():
    """
    Albert is a special case because it includes a Linear layer in the encoder
    that maps the embeddings to the encoder hidden size, but we include this
    linear layer within our embedding layer.
    """
    transformer_embedding = TransformerEmbeddings.from_pretrained_module(
        "albert-base-v2",
    )
    albert = AutoModel.from_pretrained("albert-base-v2")
    assert_allclose(
        transformer_embedding.embeddings.word_embeddings.weight.data,
        albert.embeddings.word_embeddings.weight.data,
    )
    assert_allclose(
        transformer_embedding.linear_transform.weight.data,
        albert.encoder.embedding_hidden_mapping_in.weight.data,
    )
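A minimal sketch of the dimensions behind this test, assuming only that the transformers library is installed; the concrete widths mentioned in the comment are what albert-base-v2 is generally configured with and are not asserted by the test above. ALBERT keeps a narrow embedding_size separate from the encoder's hidden_size, and the copied linear layer is what bridges the two, which is why the AllenNLP embedding module ends up emitting hidden_size-wide vectors directly.

from transformers import AutoConfig

config = AutoConfig.from_pretrained("albert-base-v2")
# The raw embedding width is smaller than the encoder width (128 vs. 768 for this
# checkpoint), so a projection is needed before the encoder stack.
assert config.embedding_size < config.hidden_size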
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-cased", relevant_module="bert.embeddings") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=8, strict=False, ) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=4, mapping={ f"layer.{l}": f"layers.{i}" for (i, l) in enumerate(range(8, 12)) }, strict=False, )
def test_forward_against_huggingface_output(self, module_name, hf_module):
    input_ids = torch.tensor([[1, 2]])
    token_type_ids = torch.tensor([[1, 0]], dtype=torch.long)
    position_ids = torch.tensor([[0, 1]])

    torch.manual_seed(1234)
    embeddings = TransformerEmbeddings.from_pretrained_module(hf_module)

    torch.manual_seed(1234)
    embeddings = embeddings.eval()  # setting to eval mode to avoid non-deterministic dropout.
    output = embeddings.forward(
        input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids
    )

    torch.manual_seed(1234)
    hf_module = hf_module.eval()  # setting to eval mode to avoid non-deterministic dropout.
    hf_output = hf_module.forward(
        input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids
    )

    assert torch.allclose(output, hf_output)
def test_end_to_end(self, model_name: str):
    data = [
        ("I'm against picketing", "but I don't know how to show it."),
        ("I saw a human pyramid once.", "It was very unnecessary."),
    ]
    tokenizer = cached_transformers.get_tokenizer(model_name)
    batch = tokenizer.batch_encode_plus(data, padding=True, return_tensors="pt")

    with torch.no_grad():
        huggingface_model = cached_transformers.get(model_name, make_copy=False).eval()
        huggingface_output = huggingface_model(**batch)

        embeddings = TransformerEmbeddings.from_pretrained_module(model_name).eval()
        transformer_stack = TransformerStack.from_pretrained_module(model_name).eval()
        pooler = TransformerPooler.from_pretrained_module(model_name).eval()

        batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
        output = embeddings(**batch)
        output = transformer_stack(output, batch["attention_mask"])

        assert_allclose(
            output.final_hidden_states,
            huggingface_output.last_hidden_state,
            rtol=0.0001,
            atol=1e-4,
        )

        output = pooler(output.final_hidden_states)
        assert_allclose(output, huggingface_output.pooler_output, rtol=0.0001, atol=1e-4)
def __init__( self, vocab: Vocabulary, transformer_model: str = "roberta-large", num_labels: Optional[int] = None, label_namespace: str = "labels", override_weights_file: Optional[str] = None, **kwargs, ) -> None: super().__init__(vocab, **kwargs) transformer_kwargs = { "model_name": transformer_model, "weights_path": override_weights_file, } self.embeddings = TransformerEmbeddings.from_pretrained_module( **transformer_kwargs) self.transformer_stack = TransformerStack.from_pretrained_module( **transformer_kwargs) self.pooler = TransformerPooler.from_pretrained_module( **transformer_kwargs) self.pooler_dropout = Dropout(p=0.1) self.label_tokens = vocab.get_index_to_token_vocabulary( label_namespace) if num_labels is None: num_labels = len(self.label_tokens) self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(), num_labels) self.linear_layer.weight.data.normal_(mean=0.0, std=0.02) self.linear_layer.bias.data.zero_() from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure self.loss = torch.nn.CrossEntropyLoss() self.acc = CategoricalAccuracy() self.f1 = FBetaMeasure()
def transformer_embeddings(params):
    return TransformerEmbeddings.from_params(params.duplicate())
def test_loading_from_pretrained_module(pretrained_name):
    TransformerEmbeddings.from_pretrained_module(pretrained_name)
def test_no_token_type_layer(params):
    params["type_vocab_size"] = 0
    module = TransformerEmbeddings.from_params(params)
    assert len(module.embeddings) == 2
def from_huggingface_model_name(
    cls,
    vocab: Vocabulary,
    model_name: str,
    image_feature_dim: int,
    image_num_hidden_layers: int,
    image_hidden_size: int,
    image_num_attention_heads: int,
    combined_hidden_size: int,
    combined_num_attention_heads: int,
    pooled_output_dim: int,
    image_intermediate_size: int,
    image_attention_dropout: float,
    image_hidden_dropout: float,
    image_biattention_id: List[int],
    text_biattention_id: List[int],
    text_fixed_layer: int,
    image_fixed_layer: int,
    pooled_dropout: float = 0.1,
    fusion_method: str = "sum",
    *,
    ignore_text: bool = False,
    ignore_image: bool = False,
):
    transformer = AutoModel.from_pretrained(model_name)

    # Albert (and maybe others?) has this "embedding_size" that's different from "hidden_size".
    # To get them to the same dimensionality, it uses a linear transform after the embedding
    # layer, which we need to pull out and copy here.
    if hasattr(transformer.config, "embedding_size"):
        config = transformer.config

        # Project the embeddings up to the encoder's hidden size (the config attribute is
        # `hidden_size`, not `hidden_dim`).
        text_embeddings = TransformerEmbeddings.from_pretrained_module(
            transformer.embeddings, output_size=config.hidden_size
        )

        from transformers.models.albert.modeling_albert import AlbertModel

        if isinstance(transformer, AlbertModel):
            text_embeddings.linear_transform = deepcopy(
                transformer.encoder.embedding_hidden_mapping_in
            )
        else:
            logger.warning(
                "Unknown model that uses separate embedding size; weights of the linear "
                f"transform will not be initialized. Model type is: {transformer.__class__}"
            )
    else:
        text_embeddings = TransformerEmbeddings.from_pretrained_module(transformer.embeddings)

    image_embeddings = ImageFeatureEmbeddings(
        feature_size=image_feature_dim,
        embedding_size=image_hidden_size,
        dropout=image_hidden_dropout,
    )

    encoder = BiModalEncoder.from_pretrained_module(
        pretrained_module=transformer,
        num_hidden_layers2=image_num_hidden_layers,
        hidden_size2=image_hidden_size,
        num_attention_heads2=image_num_attention_heads,
        combined_hidden_size=combined_hidden_size,
        combined_num_attention_heads=combined_num_attention_heads,
        intermediate_size2=image_intermediate_size,
        attention_dropout2=image_attention_dropout,
        hidden_dropout2=image_hidden_dropout,
        biattention_id1=text_biattention_id,
        biattention_id2=image_biattention_id,
        fixed_layer1=text_fixed_layer,
        fixed_layer2=image_fixed_layer,
    )

    return cls(
        vocab=vocab,
        text_embeddings=text_embeddings,
        image_embeddings=image_embeddings,
        encoder=encoder,
        pooled_output_dim=pooled_output_dim,
        fusion_method=fusion_method,
        dropout=pooled_dropout,
        ignore_text=ignore_text,
        ignore_image=ignore_image,
    )
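A minimal sketch of why ALBERT's embedding_hidden_mapping_in can be reused as the embedding module's output transform above, assuming only that the transformers library is installed; it simply checks the shape of the layer being deep-copied, independent of the AllenNLP code.

from transformers import AutoModel

albert = AutoModel.from_pretrained("albert-base-v2")
mapping_in = albert.encoder.embedding_hidden_mapping_in
# The layer maps embedding_size -> hidden_size, i.e. exactly the projection the
# text embedding module needs to produce encoder-width vectors.
assert mapping_in.in_features == albert.config.embedding_size
assert mapping_in.out_features == albert.config.hidden_size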