def test_layer_same_as_params(self):
    params = copy.deepcopy(self.params_dict)
    num_hidden_layers = params.pop("num_hidden_layers")

    torch.manual_seed(1234)
    transformer_layer = TransformerLayer(**params)
    transformer_stack_from_layer = TransformerStack(num_hidden_layers, transformer_layer)
    torch.manual_seed(1234)
    transformer_stack_from_params = TransformerStack(num_hidden_layers, **params)

    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    transformer_stack_from_layer.eval()
    transformer_stack_from_params.eval()

    torch.manual_seed(1234)
    layer_output = transformer_stack_from_layer.forward(hidden_states, attention_mask=attention_mask)
    torch.manual_seed(1234)
    params_output = transformer_stack_from_params.forward(hidden_states, attention_mask=attention_mask)

    assert torch.allclose(layer_output[0], params_output[0])
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-uncased") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(0, 8)) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(8, 12), )
def __init__(self):
    super().__init__()
    self.embeddings = TransformerEmbeddings.from_pretrained_module(pretrained)
    self.transformer = TransformerStack.from_pretrained_module(pretrained, num_hidden_layers=4)
def __init__( self, vocab: Vocabulary, transformer_model: str = "roberta-large", override_weights_file: Optional[str] = None, **kwargs ) -> None: super().__init__(vocab, **kwargs) transformer_kwargs = { "model_name": transformer_model, "weights_path": override_weights_file, } self.embeddings = TransformerEmbeddings.from_pretrained_module(**transformer_kwargs) self.transformer_stack = TransformerStack.from_pretrained_module(**transformer_kwargs) self.pooler = TransformerPooler.from_pretrained_module(**transformer_kwargs) self.pooler_dropout = Dropout(p=0.1) self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(), 1) self.linear_layer.weight.data.normal_(mean=0.0, std=0.02) self.linear_layer.bias.data.zero_() self.loss = torch.nn.CrossEntropyLoss() from allennlp.training.metrics import CategoricalAccuracy self.accuracy = CategoricalAccuracy()
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.get_relevant_module( "albert-base-v2") self.transformer = TransformerStack.from_pretrained_module( "bert-base-uncased") # We want to tune only the embeddings, because that's our experiment. self.transformer.requires_grad = False
def __init__(self):
    super().__init__()
    self.embeddings = AutoModel.from_pretrained("albert-base-v2").embeddings
    self.transformer = TransformerStack.from_pretrained_module(
        "bert-base-cased", relevant_module="bert.encoder"
    )
    # We want to tune only the embeddings, because that's our experiment.
    self.transformer.requires_grad_(False)
def test_loading_from_pretrained_weights(self):
    pretrained_module = self.pretrained.encoder
    module = TransformerStack.from_pretrained_module(pretrained_module)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(pretrained_module, "huggingface", {}).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping)
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( pretrained, relevant_module="bert.embeddings") self.transformer = TransformerStack.from_pretrained_module( pretrained, num_hidden_layers=4, relevant_module="bert.encoder", strict=False, )
def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-cased", relevant_module="bert.embeddings") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=8, strict=False, ) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=4, mapping={ f"layer.{l}": f"layers.{i}" for (i, l) in enumerate(range(8, 12)) }, strict=False, )
def test_transformer_stack_from_params(params):
    torch.manual_seed(SEED)
    transformer_stack = TransformerStack.from_params(params)

    # Make sure we have the right number of modules.
    modules = dict(transformer_stack.named_modules())
    assert len(modules["layers"]) == PARAMS_DICT["num_hidden_layers"]

    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    # Make sure forward pass can run.
    torch.manual_seed(SEED)
    output = transformer_stack.forward(hidden_states, attention_mask=attention_mask)

    # Make sure we get the same results when instantiating from a single layer.
    torch.manual_seed(SEED)
    layer_params = copy.deepcopy(PARAMS_DICT)
    num_hidden_layers = layer_params.pop("num_hidden_layers")
    transformer_layer = TransformerLayer(**layer_params)  # type: ignore[arg-type]
    transformer_stack_from_layer = TransformerStack(
        num_hidden_layers, transformer_layer  # type: ignore[arg-type]
    )

    torch.manual_seed(SEED)
    from_layer_output = transformer_stack_from_layer.forward(hidden_states, attention_mask=attention_mask)

    assert torch.allclose(from_layer_output.final_hidden_states, output.final_hidden_states)

    # Make sure forward pass raises with bad input.
    with pytest.raises(AssertionError):
        transformer_stack.forward(
            torch.randn(2, 3, 6),
            attention_mask=torch.randn(2, 3),
            encoder_hidden_states=torch.randn(2, 3, 6),
        )
def test_cross_attention(self):
    params = copy.deepcopy(self.params_dict)
    params["add_cross_attention"] = True
    params = Params(params)

    transformer_stack = TransformerStack.from_params(params)
    modules = dict(transformer_stack.named_modules())
    assert hasattr(modules["layers.0"], "cross_attention")

    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
    transformer_stack.forward(
        torch.randn(2, 3, 6),
        attention_mask=attention_mask,
        encoder_hidden_states=torch.randn(2, 3, 6),
    )

    transformer_stack_new = TransformerStack.from_pretrained_module(transformer_stack, source="allennlp")
    new_modules = dict(transformer_stack_new.named_modules())
    assert hasattr(new_modules["layers.0"], "cross_attention")
def test_forward_against_huggingface_outputs(self, module_name, hf_module):
    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    stack = TransformerStack.from_pretrained_module(hf_module)

    torch.manual_seed(1234)
    output = stack.forward(hidden_states, attention_mask=attention_mask)
    # We do this because bert, roberta, electra process the attention_mask at the model level.
    attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5
    torch.manual_seed(1234)
    hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf)

    assert torch.allclose(output[0], hf_output[0])
def test_transformer_stack_with_cross_attention(params):
    params["add_cross_attention"] = True

    transformer_stack = TransformerStack.from_params(params).eval()
    modules = dict(transformer_stack.named_modules())
    assert hasattr(modules["layers.0"], "cross_attention")

    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
    transformer_stack.forward(
        torch.randn(2, 3, 6),
        attention_mask=attention_mask,
        encoder_hidden_states=torch.randn(2, 3, 6),
    )
def __init__(self, vocab, embedding_dim, hidden_size, intermediate_size):
    super().__init__()
    self.embeddings = Embedding(
        pretrained_file=embedding_file,
        embedding_dim=embedding_dim,
        projection_dim=hidden_size,
        vocab=vocab,
    )
    self.transformer = TransformerStack(
        num_hidden_layers=4,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
    )
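# A minimal usage sketch for a stack built from scratch like the one above. This is not
# part of the original example; the sizes and `num_attention_heads=2` are assumptions
# chosen to mirror the test params elsewhere in these snippets (hidden_size must be
# divisible by the number of heads).
sketch_stack = TransformerStack(
    num_hidden_layers=4,
    hidden_size=6,
    intermediate_size=3,
    num_attention_heads=2,
)
sketch_hidden_states = torch.randn(2, 3, 6)          # (batch_size, seq_len, hidden_size)
sketch_attention_mask = torch.randint(0, 2, (2, 3))  # 1 = attend, 0 = mask
sketch_output = sketch_stack(sketch_hidden_states, attention_mask=sketch_attention_mask)
# As in the tests above, the top-layer representations come back on the output object
# (e.g. `sketch_output.final_hidden_states`, or index 0 in older versions).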
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False)

    if "distilbert" in pretrained_name:
        pretrained_module = pretrained.transformer
    else:
        pretrained_module = pretrained.encoder

    torch.manual_seed(1234)
    module = TransformerStack.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(pretrained_module, "huggingface", {}).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping=mapping)

    batch_size = 1
    seq_len = 768
    dim = dict(module.named_modules())["layers.0.attention.self.query"].in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.randint(0, 2, (batch_size, seq_len))
    # HuggingFace expects an additive mask broadcast over (batch_size, num_heads, seq_len, seq_len).
    mask_reshp = (batch_size, 1, 1, seq_len)
    attention_mask_hf = (attention_mask == 0).view(mask_reshp)
    attention_mask_hf = attention_mask_hf.expand(batch_size, 12, seq_len, seq_len) * -10e5

    # Set eval mode to avoid non-deterministic dropout.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0]
    torch.manual_seed(1234)
    hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask_hf)[0]

    assert torch.allclose(output, hf_output)
def setup_method(self):
    super().setup_method()
    self.params_dict = {
        "num_hidden_layers": 3,
        "hidden_size": 6,
        "intermediate_size": 3,
        "num_attention_heads": 2,
        "attention_dropout": 0.1,
        "hidden_dropout": 0.2,
        "activation": "relu",
    }
    params = Params(copy.deepcopy(self.params_dict))
    self.transformer_stack = TransformerStack.from_params(params)
    self.pretrained_name = "bert-base-uncased"
    self.pretrained = cached_transformers.get(self.pretrained_name, False)
def test_loading_partial_pretrained_weights(self):
    kwargs = TransformerStack._get_input_arguments(self.pretrained.encoder)
    # The pretrained module has 12 bert layers, while the instance will have only 3.
    kwargs["num_hidden_layers"] = 3
    transformer_stack = TransformerStack(**kwargs)
    transformer_stack._load_from_pretrained_module(self.pretrained.encoder)
    mapping = {
        val: key
        for key, val in transformer_stack._construct_default_mapping(
            self.pretrained.encoder, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        self.pretrained.encoder,
        transformer_stack,
        mapping,
    )
def test_loading_from_pretrained(pretrained_model_name):
    transformer_stack = TransformerStack.from_pretrained_module(pretrained_model_name).eval()
    pretrained_module = cached_transformers.get(pretrained_model_name, True).encoder.eval()

    batch_size = 2
    seq_length = 15
    hidden_size = transformer_stack.layers[0]._hidden_size
    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    # HuggingFace expects an additive mask: 0 where tokens attend, a large negative value where they don't.
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(SEED)
    output = transformer_stack(hidden_states, attention_mask=attention_mask)
    torch.manual_seed(SEED)
    hf_output = pretrained_module(hidden_states, attention_mask=attention_mask_hf)

    assert torch.allclose(output.final_hidden_states, hf_output[0])
def test_end_to_end(self, model_name: str):
    data = [
        ("I'm against picketing", "but I don't know how to show it."),
        ("I saw a human pyramid once.", "It was very unnecessary."),
    ]
    tokenizer = cached_transformers.get_tokenizer(model_name)
    batch = tokenizer.batch_encode_plus(data, padding=True, return_tensors="pt")

    with torch.no_grad():
        huggingface_model = cached_transformers.get(model_name, make_copy=False).eval()
        huggingface_output = huggingface_model(**batch)

        embeddings = TransformerEmbeddings.from_pretrained_module(model_name).eval()
        transformer_stack = TransformerStack.from_pretrained_module(model_name).eval()
        pooler = TransformerPooler.from_pretrained_module(model_name).eval()

        batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
        output = embeddings(**batch)
        output = transformer_stack(output, batch["attention_mask"])

        assert_allclose(
            output.final_hidden_states,
            huggingface_output.last_hidden_state,
            rtol=0.0001,
            atol=1e-4,
        )

        output = pooler(output.final_hidden_states)
        assert_allclose(output, huggingface_output.pooler_output, rtol=0.0001, atol=1e-4)
def __init__( self, vocab: Vocabulary, transformer_model: str = "roberta-large", num_labels: Optional[int] = None, label_namespace: str = "labels", override_weights_file: Optional[str] = None, **kwargs, ) -> None: super().__init__(vocab, **kwargs) transformer_kwargs = { "model_name": transformer_model, "weights_path": override_weights_file, } self.embeddings = TransformerEmbeddings.from_pretrained_module( **transformer_kwargs) self.transformer_stack = TransformerStack.from_pretrained_module( **transformer_kwargs) self.pooler = TransformerPooler.from_pretrained_module( **transformer_kwargs) self.pooler_dropout = Dropout(p=0.1) self.label_tokens = vocab.get_index_to_token_vocabulary( label_namespace) if num_labels is None: num_labels = len(self.label_tokens) self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(), num_labels) self.linear_layer.weight.data.normal_(mean=0.0, std=0.02) self.linear_layer.bias.data.zero_() from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure self.loss = torch.nn.CrossEntropyLoss() self.acc = CategoricalAccuracy() self.f1 = FBetaMeasure()
def test_loading_partial_pretrained_weights():
    # The pretrained module has 12 bert layers, while the instance will have only 3.
    TransformerStack.from_pretrained_module("bert-base-cased", num_hidden_layers=3, strict=False)
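# A follow-up sketch (not part of the original test): one way to confirm that partial
# loading really produced a 3-layer stack, using the `layers` ModuleList that the other
# examples here index into (e.g. `transformer_stack.layers[0]`).
partial_stack = TransformerStack.from_pretrained_module(
    "bert-base-cased", num_hidden_layers=3, strict=False
)
assert len(partial_stack.layers) == 3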