def test_loading_from_pretrained_weights(self):
    pretrained_module = self.pretrained.encoder
    required_kwargs = [
        "num_hidden_layers2",
        "hidden_size2",
        "combined_hidden_size",
        "intermediate_size2",
        "num_attention_heads2",
        "combined_num_attention_heads",
        "attention_dropout2",
        "hidden_dropout2",
        "biattention_id1",
        "biattention_id2",
        "fixed_layer1",
        "fixed_layer2",
    ]
    kwargs = {key: self.params_dict[key] for key in required_kwargs}
    module = BiModalEncoder.from_pretrained_module(pretrained_module, **kwargs)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        pretrained_module,
        module,
        ignore_missing=True,
        mapping=mapping,
    )
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False)

    if "distilbert" in pretrained_name:
        encoder = pretrained.transformer
    else:
        encoder = pretrained.encoder
    # Hacky way to get a bert layer.
    for i, pretrained_module in enumerate(encoder.layer.modules()):
        if i == 1:
            break

    # Get the self attention layer.
    if "distilbert" in pretrained_name:
        pretrained_module = pretrained_module.attention
    else:
        pretrained_module = pretrained_module.attention.self

    torch.manual_seed(1234)
    module = SelfAttention.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping=mapping)

    batch_size = 2
    seq_len = 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len))

    # Setting to eval mode to avoid non-deterministic dropout.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0]
    if "distilbert" in pretrained_name:
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, hidden_states, hidden_states, mask=attention_mask
        )[0]
    else:
        # The attn_mask is processed outside the self attention module in HF bert models.
        attention_mask = (~(attention_mask == 1)) * -10e5
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, attention_mask=attention_mask
        )[0]

    assert torch.allclose(output, hf_output)
def test_loading_from_pretrained_weights(self):
    pretrained_module = self.pretrained.encoder
    module = TransformerStack.from_pretrained_module(pretrained_module)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping)
def test_model_loads_weights_correctly(self):
    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(
        ["orange", "net", "netting", "pitcher", "catcher"], "answers"
    )

    model_name = "epwalsh/bert-xsmall-dummy"
    model = VqaVilbert.from_huggingface_model_name(
        vocab=vocab,
        model_name=model_name,
        image_feature_dim=2048,
        image_num_hidden_layers=1,
        image_hidden_size=6,
        combined_hidden_size=10,
        pooled_output_dim=7,
        image_intermediate_size=11,
        image_attention_dropout=0.0,
        image_hidden_dropout=0.0,
        image_biattention_id=[0, 1],
        text_biattention_id=[0, 1],
        text_fixed_layer=0,
        image_fixed_layer=0,
        image_num_attention_heads=3,
        combined_num_attention_heads=2,
    )

    transformer = AutoModel.from_pretrained(model_name)

    # Compare embedding parameters.
    mapping = {
        val: key
        for key, val in model.backbone.text_embeddings._construct_default_mapping(
            transformer.embeddings, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        transformer.embeddings, model.backbone.text_embeddings, mapping=mapping
    )

    # Compare encoder parameters. We ignore the new parameters for the second modality,
    # since they won't be present in the huggingface model.
    mapping = {
        val: key
        for key, val in model.backbone.encoder._construct_default_mapping(
            transformer.encoder, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        transformer.encoder, model.backbone.encoder, ignore_missing=True, mapping=mapping
    )
def test_model_loads_weights_correctly(self):
    from allennlp_models.vision.models.visual_entailment import VisualEntailmentModel

    vocab = Vocabulary()
    model_name = "epwalsh/bert-xsmall-dummy"
    model = VisualEntailmentModel.from_huggingface_model_name(
        vocab=vocab,
        model_name=model_name,
        image_feature_dim=2048,
        image_num_hidden_layers=1,
        image_hidden_size=3,
        image_num_attention_heads=1,
        combined_num_attention_heads=1,
        combined_hidden_size=5,
        pooled_output_dim=7,
        image_intermediate_size=11,
        image_attention_dropout=0.0,
        image_hidden_dropout=0.0,
        image_biattention_id=[0, 1],
        text_biattention_id=[0, 1],
        text_fixed_layer=0,
        image_fixed_layer=0,
    )

    transformer = AutoModel.from_pretrained(model_name)

    # Compare embedding parameters.
    mapping = {
        val: key
        for key, val in model.backbone.text_embeddings._construct_default_mapping(
            transformer.embeddings, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        transformer.embeddings, model.backbone.text_embeddings, mapping=mapping
    )

    # Compare encoder parameters. We ignore the new parameters for the second modality,
    # since they won't be present in the huggingface model.
    mapping = {
        val: key
        for key, val in model.backbone.encoder._construct_default_mapping(
            transformer.encoder, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        transformer.encoder, model.backbone.encoder, ignore_missing=True, mapping=mapping
    )
def test_loading_from_pretrained_weights(self):
    # Hacky way to get a bert layer.
    for i, pretrained_module in enumerate(self.pretrained.encoder.layer.modules()):
        if i == 1:
            break

    module = TransformerLayer.from_pretrained_module(pretrained_module)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping=mapping)
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False)

    if "distilbert" in pretrained_name:
        encoder = pretrained.transformer
    else:
        encoder = pretrained.encoder
    # Hacky way to get a bert layer.
    for i, pretrained_module in enumerate(encoder.layer.modules()):
        if i == 1:
            break

    torch.manual_seed(1234)
    module = TransformerLayer.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(pretrained_module, module, mapping=mapping)

    batch_size = 2
    seq_len = 768
    dim = module.attention.self.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.randint(0, 2, (batch_size, seq_len))
    mask_reshp = (batch_size, 1, 1, dim)
    attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand(
        batch_size, 12, seq_len, seq_len
    ) * -10e5

    # Setting to eval mode to avoid non-deterministic dropout.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0]
    torch.manual_seed(1234)
    hf_output = pretrained_module.forward(
        hidden_states, attention_mask=attention_mask_hf
    )[0]

    assert torch.allclose(output, hf_output, atol=1e-04)
def test_loading_partial_pretrained_weights(self):
    kwargs = TransformerStack._get_input_arguments(self.pretrained.encoder)
    # The pretrained module has 12 bert layers, while the instance will have only 3.
    kwargs["num_hidden_layers"] = 3
    transformer_stack = TransformerStack(**kwargs)
    transformer_stack._load_from_pretrained_module(self.pretrained.encoder)
    mapping = {
        val: key
        for key, val in transformer_stack._construct_default_mapping(
            self.pretrained.encoder, "huggingface", {}
        ).items()
    }
    assert_equal_parameters(
        self.pretrained.encoder,
        transformer_stack,
        mapping,
    )
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    pretrained_module = cached_transformers.get(pretrained_name, False).embeddings
    module = TransformerEmbeddings.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    missing = assert_equal_parameters(pretrained_module, module, mapping=mapping)
    assert len(missing) == 0
def test_use_selected_layers_of_bert_for_different_purposes(self):
    class MediumTransformer(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.embeddings = TransformerEmbeddings.from_pretrained_module(
                "bert-base-uncased"
            )
            self.separate_transformer = TransformerStack.from_pretrained_module(
                "bert-base-uncased", num_hidden_layers=range(0, 8)
            )
            self.combined_transformer = TransformerStack.from_pretrained_module(
                "bert-base-uncased",
                num_hidden_layers=range(8, 12),
            )

        @overrides
        def forward(
            self,
            left_token_ids: torch.LongTensor,
            right_token_ids: torch.LongTensor,
        ):
            left = self.embeddings(left_token_ids)
            left = self.separate_transformer(left)

            right = self.embeddings(right_token_ids)
            right = self.separate_transformer(right)

            # Combine the sequences in some meaningful way. Here, we just add them.
            # combined = combine_masked_sequences(left, left_mask, right, right_mask)
            combined = left + right

            return self.combined_transformer(combined)

    medium = MediumTransformer()
    assert len(medium.separate_transformer.layers) == 8
    assert len(medium.combined_transformer.layers) == 4

    pretrained = cached_transformers.get("bert-base-uncased", False)
    pretrained_layers = dict(pretrained.encoder.layer.named_modules())
    medium_layers = dict(medium.combined_transformer.layers.named_modules())

    assert_equal_parameters(
        medium_layers["0"], pretrained_layers["8"], TransformerStack._huggingface_mapping
    )
    assert_equal_parameters(
        medium_layers["1"], pretrained_layers["9"], TransformerStack._huggingface_mapping
    )
    assert_equal_parameters(
        medium_layers["2"], pretrained_layers["10"], TransformerStack._huggingface_mapping
    )
    assert_equal_parameters(
        medium_layers["3"], pretrained_layers["11"], TransformerStack._huggingface_mapping
    )
def test_can_load_pretrained_weights(self):
    class InternalOld(torch.nn.Module):
        def __init__(self, inp, out):
            super().__init__()
            self.ff = torch.nn.Linear(inp, out)

        def forward(self, x):
            x = self.ff(x)
            return x

    class InternalNew(TransformerModule):
        def __init__(self, inp, out):
            super().__init__()
            self.linear = torch.nn.Linear(inp, out)

        def _construct_default_mapping(self, pretrained_module, source, mapping):
            # Maps the pretrained module's submodule names to this module's names.
            return {"ff": "linear"}

        def forward(self, x):
            x = self.linear(x)
            return x

    class ExternalOld(torch.nn.Module):
        def __init__(self, inp, out):
            super().__init__()
            self.internal = InternalOld(inp, out)

        def forward(self, x):
            x = self.internal(x)
            return x

    class External(TransformerModule):
        # Maps the pretrained module's submodule names to this module's names.
        _huggingface_mapping = {"internal": "internal_layer"}

        def __init__(self, inp, out):
            super().__init__()
            self.internal_layer = InternalNew(inp, out)

        def forward(self, x):
            x = self.internal_layer(x)
            return x

    iold = InternalOld(3, 5)
    x = torch.randn(4, 3)
    iold.forward(x)

    inew = InternalNew(3, 5)
    inew._load_from_pretrained_module(iold)
    mapping = {
        val: key
        for key, val in inew._construct_default_mapping(iold, "huggingface", {}).items()
    }
    assert_equal_parameters(iold, inew, mapping=mapping)

    eold = ExternalOld(3, 5)
    x = torch.randn(4, 3)
    eold.forward(x)

    enew = External(3, 5)
    enew._load_from_pretrained_module(eold)
    mapping = {
        val: key
        for key, val in enew._construct_default_mapping(eold, "huggingface", {}).items()
    }
    assert_equal_parameters(eold, enew, mapping=mapping)
def test_get_mapped_state_dict(self):
    class InternalOld(torch.nn.Module):
        def __init__(self, inp, out):
            super().__init__()
            self.ff = torch.nn.Linear(inp, out)
            self.p = Parameter(torch.randn(out, out))
            self.register_buffer("b", torch.randn(inp, inp))

        def forward(self, x):
            x = self.ff(x).matmul(self.p)
            return x

    class InternalNew(TransformerModule):
        _pretrained_mapping = {"ff": "linear", "p": "param", "b": "buffer"}

        def __init__(self, inp, out):
            super().__init__()
            self.linear = torch.nn.Linear(inp, out)
            self.param = Parameter(torch.randn(out, out))
            self.register_buffer("buffer", torch.randn(inp, inp))

        def forward(self, x):
            x = self.linear(x).matmul(self.param)
            return x

    class ExternalOld(torch.nn.Module):
        def __init__(self, inp, out):
            super().__init__()
            self.internal = InternalOld(inp, out)
            self.p = Parameter(torch.randn(out, out))

        def forward(self, x):
            x = self.internal(x).matmul(self.p)
            return x

    class ExternalNew(TransformerModule):
        _pretrained_mapping = {"internal": "internal_layer", "p": "param"}

        def __init__(self, inp, out):
            super().__init__()
            self.internal_layer = InternalNew(inp, out)
            self.param = Parameter(torch.randn(out, out))

        def forward(self, x):
            x = self.internal_layer(x).matmul(self.param)
            return x

    eold = ExternalOld(3, 5)
    state_dict_old = eold.state_dict()

    enew = ExternalNew(3, 5)
    state_dict_new = enew._get_mapped_state_dict(state_dict_old)
    assert set(state_dict_new.keys()) == set(
        [
            "internal_layer.linear.weight",
            "internal_layer.linear.bias",
            "internal_layer.param",
            "internal_layer.buffer",
            "param",
        ]
    )

    enew.load_state_dict(state_dict_new)

    x = torch.randn(4, 3)
    out_old = eold(x)
    out_new = enew(x)
    assert_allclose(out_old, out_new)

    assert_equal_parameters(
        eold,
        enew,
        mapping={
            "internal_layer.linear.weight": "internal.ff.weight",
            "internal_layer.linear.bias": "internal.ff.bias",
            "internal_layer.param": "internal.p",
            "internal_layer.buffer": "internal.b",
            "param": "p",
        },
    )
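# NOTE: The tests above are excerpted from several test classes and depend on shared
# scaffolding that is not shown here. The sketch below is an assumption inferred from the
# calls in the excerpts (module paths, the `self.pretrained` fixture, and the placeholder
# values in `params_dict` are guesses, not copied from the original files); it is included
# only to make the excerpts easier to read in isolation. `assert_equal_parameters` is
# assumed to be a project test utility that compares two modules' parameters through the
# given name mapping and returns any parameter names it could not match.

import torch
from torch.nn import Parameter
from torch.testing import assert_allclose
from overrides import overrides
from transformers import AutoModel

from allennlp.common import cached_transformers
from allennlp.data import Vocabulary
from allennlp.modules.transformer import (
    BiModalEncoder,
    SelfAttention,
    TransformerEmbeddings,
    TransformerLayer,
    TransformerModule,
    TransformerStack,
)


class TransformerToolkitTestCase:
    """Hypothetical shared fixture backing `self.pretrained` and `self.params_dict` above."""

    def setup_method(self):
        # A cached HF BERT model; `False` means "do not make a copy of the cached weights".
        self.pretrained = cached_transformers.get("bert-base-uncased", False)
        # Kwargs consumed by the BiModalEncoder test; the values here are placeholders.
        self.params_dict = {
            "num_hidden_layers2": 3,
            "hidden_size2": 6,
            "combined_hidden_size": 12,
            "intermediate_size2": 3,
            "num_attention_heads2": 2,
            "combined_num_attention_heads": 2,
            "attention_dropout2": 0.1,
            "hidden_dropout2": 0.2,
            "biattention_id1": [1, 2],
            "biattention_id2": [1, 2],
            "fixed_layer1": 1,
            "fixed_layer2": 1,
        }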