Example #1
    def test_loading_from_pretrained_weights(self):
        pretrained_module = self.pretrained.encoder
        required_kwargs = [
            "num_hidden_layers2",
            "hidden_size2",
            "combined_hidden_size",
            "intermediate_size2",
            "num_attention_heads2",
            "combined_num_attention_heads",
            "attention_dropout2",
            "hidden_dropout2",
            "biattention_id1",
            "biattention_id2",
            "fixed_layer1",
            "fixed_layer2",
        ]
        kwargs = {key: self.params_dict[key] for key in required_kwargs}
        module = BiModalEncoder.from_pretrained_module(pretrained_module,
                                                       **kwargs)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(
            pretrained_module,
            module,
            ignore_missing=True,
            mapping=mapping,
        )
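
The pattern above recurs throughout these examples: the dictionary returned by _construct_default_mapping is inverted and handed to assert_equal_parameters so that parameters can be matched by name across the two modules. Below is a rough, hypothetical sketch of such a mapping-based comparison (not AllenNLP's implementation; it assumes the mapping relates dot-separated name segments in the pretrained module to the corresponding segments in the new module):

# Hypothetical sketch only, not AllenNLP's assert_equal_parameters.
# Assumes `mapping` maps name segments in the pretrained module to the
# corresponding segments in the new module.
import torch

def compare_parameters_sketch(pretrained_module, new_module, mapping, ignore_missing=False):
    new_params = dict(new_module.named_parameters())
    missing = []
    for old_name, old_param in pretrained_module.named_parameters():
        new_name = ".".join(mapping.get(part, part) for part in old_name.split("."))
        if new_name not in new_params:
            missing.append(new_name)
            continue
        assert torch.allclose(old_param, new_params[new_name])
    if not ignore_missing:
        assert not missing, f"parameters missing from the new module: {missing}"
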
Example #2
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        # Get the self attention layer.
        if "distilbert" in pretrained_name:
            pretrained_module = pretrained_module.attention
        else:
            pretrained_module = pretrained_module.attention.self

        torch.manual_seed(1234)
        module = SelfAttention.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 3
        dim = module.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len))

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        if "distilbert" in pretrained_name:
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(hidden_states,
                                                  hidden_states,
                                                  hidden_states,
                                                  mask=attention_mask)[0]
        else:
            # The attn_mask is processed outside the self attention module in HF bert models.
            attention_mask = (~(attention_mask == 1)) * -10e5
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(
                hidden_states, attention_mask=attention_mask)[0]

        assert torch.allclose(output, hf_output)
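
The bert branch above relies on the additive-mask convention mentioned in the comment: a 0/1 padding mask is turned into large negative biases that are added to the raw attention scores before softmax, driving the weight of masked positions to roughly zero. A standalone illustration of that transformation (independent of the modules under test):

# Standalone illustration of the additive attention-mask convention used above.
import torch

scores = torch.randn(2, 1, 3, 3)                 # (batch, heads, query, key)
mask = torch.tensor([[1, 1, 0], [1, 0, 0]])      # 1 = attend, 0 = padding
additive = (~(mask == 1)) * -10e5                # same transformation as above
probs = torch.softmax(scores + additive[:, None, None, :], dim=-1)
# probs[..., -1] for the first sequence and probs[..., 1:] for the second are ~0.
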
Example #3
    def test_loading_from_pretrained_weights(self):
        pretrained_module = self.pretrained.encoder
        module = TransformerStack.from_pretrained_module(pretrained_module)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)
Example #4
    def test_model_loads_weights_correctly(self):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            ["orange", "net", "netting", "pitcher", "catcher"], "answers")

        model_name = "epwalsh/bert-xsmall-dummy"
        model = VqaVilbert.from_huggingface_model_name(
            vocab=vocab,
            model_name=model_name,
            image_feature_dim=2048,
            image_num_hidden_layers=1,
            image_hidden_size=6,
            combined_hidden_size=10,
            pooled_output_dim=7,
            image_intermediate_size=11,
            image_attention_dropout=0.0,
            image_hidden_dropout=0.0,
            image_biattention_id=[0, 1],
            text_biattention_id=[0, 1],
            text_fixed_layer=0,
            image_fixed_layer=0,
            image_num_attention_heads=3,
            combined_num_attention_heads=2,
        )

        transformer = AutoModel.from_pretrained(model_name)

        # compare embedding parameters
        mapping = {
            val: key
            for key, val in
            model.backbone.text_embeddings._construct_default_mapping(
                transformer.embeddings, "huggingface", {}).items()
        }
        assert_equal_parameters(transformer.embeddings,
                                model.backbone.text_embeddings,
                                mapping=mapping)

        # compare encoder parameters
        mapping = {
            val: key
            for key, val in model.backbone.encoder._construct_default_mapping(
                transformer.encoder, "huggingface", {}).items()
        }

        # We ignore the new parameters for the second modality, since they won't be present
        # in the huggingface model.
        assert_equal_parameters(transformer.encoder,
                                model.backbone.encoder,
                                ignore_missing=True,
                                mapping=mapping)
Example #5
    def test_model_loads_weights_correctly(self):
        from allennlp_models.vision.models.visual_entailment import VisualEntailmentModel

        vocab = Vocabulary()
        model_name = "epwalsh/bert-xsmall-dummy"
        model = VisualEntailmentModel.from_huggingface_model_name(
            vocab=vocab,
            model_name=model_name,
            image_feature_dim=2048,
            image_num_hidden_layers=1,
            image_hidden_size=3,
            image_num_attention_heads=1,
            combined_num_attention_heads=1,
            combined_hidden_size=5,
            pooled_output_dim=7,
            image_intermediate_size=11,
            image_attention_dropout=0.0,
            image_hidden_dropout=0.0,
            image_biattention_id=[0, 1],
            text_biattention_id=[0, 1],
            text_fixed_layer=0,
            image_fixed_layer=0,
        )

        transformer = AutoModel.from_pretrained(model_name)

        # compare embedding parameters
        mapping = {
            val: key
            for key, val in model.backbone.text_embeddings._construct_default_mapping(
                transformer.embeddings, "huggingface", {}
            ).items()
        }
        assert_equal_parameters(
            transformer.embeddings, model.backbone.text_embeddings, mapping=mapping
        )

        # compare encoder parameters
        mapping = {
            val: key
            for key, val in model.backbone.encoder._construct_default_mapping(
                transformer.encoder, "huggingface", {}
            ).items()
        }

        # We ignore the new parameters for the second modality, since they won't be present
        # in the huggingface model.
        assert_equal_parameters(
            transformer.encoder, model.backbone.encoder, ignore_missing=True, mapping=mapping
        )
Example #6
    def test_loading_from_pretrained_weights(self):

        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(
                self.pretrained.encoder.layer.modules()):
            if i == 1:
                break

        module = TransformerLayer.from_pretrained_module(pretrained_module)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)
Example #7
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        torch.manual_seed(1234)
        module = TransformerLayer.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 768
        dim = module.attention.self.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, seq_len))
        # HF uses an additive mask: 0 for tokens to attend to, a large negative
        # value for padding positions.
        mask_reshp = (batch_size, 1, 1, seq_len)
        attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand(
            batch_size, 12, seq_len, seq_len) * -10e5

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, attention_mask=attention_mask_hf)[0]

        assert torch.allclose(output, hf_output, atol=1e-04)
Example #8
    def test_loading_partial_pretrained_weights(self):

        kwargs = TransformerStack._get_input_arguments(self.pretrained.encoder)
        # The pretrained module has 12 bert layers, while the instance will have only 3.
        kwargs["num_hidden_layers"] = 3
        transformer_stack = TransformerStack(**kwargs)
        transformer_stack._load_from_pretrained_module(self.pretrained.encoder)
        mapping = {
            val: key
            for key, val in transformer_stack._construct_default_mapping(
                self.pretrained.encoder, "huggingface", {}).items()
        }
        assert_equal_parameters(
            self.pretrained.encoder,
            transformer_stack,
            mapping=mapping,
        )
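
The partial-loading test above builds a 3-layer TransformerStack and fills it from a 12-layer pretrained encoder, so only a subset of the pretrained weights is used. A rough sketch of the underlying idea (a hypothetical helper, not the TransformerStack internals), assuming parameter names of the form "layer.<index>.<rest>":

# Hypothetical helper illustrating partial loading: keep only the entries of a
# pretrained state dict that belong to the first n layers.
def keep_first_n_layers(state_dict, n):
    kept = {}
    for name, tensor in state_dict.items():
        _, sep, rest = name.partition("layer.")
        if sep and rest.split(".")[0].isdigit() and int(rest.split(".")[0]) >= n:
            continue
        kept[name] = tensor
    return kept
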
Example #9
    def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
        pretrained_module = cached_transformers.get(pretrained_name, False).embeddings
        module = TransformerEmbeddings.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}
            ).items()
        }
        missing = assert_equal_parameters(pretrained_module, module, mapping=mapping)
        assert len(missing) == 0
Example #10
    def test_use_selected_layers_of_bert_for_different_purposes(self):
        class MediumTransformer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    "bert-base-uncased")
                self.separate_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-uncased", num_hidden_layers=range(0, 8))
                self.combined_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-uncased",
                    num_hidden_layers=range(8, 12),
                )

            @overrides
            def forward(
                self,
                left_token_ids: torch.LongTensor,
                right_token_ids: torch.LongTensor,
            ):

                left = self.embeddings(left_token_ids)
                left = self.separate_transformer(left)

                right = self.embeddings(right_token_ids)
                right = self.separate_transformer(right)

                # combine the sequences in some meaningful way. here, we just add them.
                # combined = combine_masked_sequences(left, left_mask, right, right_mask)
                combined = left + right

                return self.combined_transformer(combined)

        medium = MediumTransformer()
        assert (len(medium.separate_transformer.layers)) == 8
        assert (len(medium.combined_transformer.layers)) == 4

        pretrained = cached_transformers.get("bert-base-uncased", False)
        pretrained_layers = dict(pretrained.encoder.layer.named_modules())

        medium_layers = dict(
            medium.combined_transformer.layers.named_modules())

        assert_equal_parameters(medium_layers["0"], pretrained_layers["8"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["1"], pretrained_layers["9"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["2"], pretrained_layers["10"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["3"], pretrained_layers["11"],
                                TransformerStack._huggingface_mapping)
Example #11
    def test_can_load_pretrained_weights(self):
        class InternalOld(torch.nn.Module):
            def __init__(self, inp, out):
                super().__init__()
                self.ff = torch.nn.Linear(inp, out)

            def forward(self, x):
                x = self.ff(x)
                return x

        class InternalNew(TransformerModule):
            def __init__(self, inp, out):
                super().__init__()
                self.linear = torch.nn.Linear(inp, out)

            def _construct_default_mapping(self, pretrained_module, source,
                                           mapping):
                # return {"linear": "ff"}
                return {"ff": "linear"}

            def forward(self, x):
                x = self.linear(x)
                return x

        class ExternalOld(torch.nn.Module):
            def __init__(self, inp, out):
                super().__init__()
                self.internal = InternalOld(inp, out)

            def forward(self, x):
                x = self.internal(x)
                return x

        class External(TransformerModule):
            # _huggingface_mapping = {"internal_layer": "internal"}
            _huggingface_mapping = {"internal": "internal_layer"}

            def __init__(self, inp, out):
                super().__init__()
                self.internal_layer = InternalNew(inp, out)

            def forward(self, x):
                x = self.internal_layer(x)
                return x

        iold = InternalOld(3, 5)
        x = torch.randn(4, 3)
        iold.forward(x)
        inew = InternalNew(3, 5)
        inew._load_from_pretrained_module(iold)
        mapping = {
            val: key
            for key, val in inew._construct_default_mapping(
                iold, "huggingface", {}).items()
        }
        assert_equal_parameters(iold, inew, mapping=mapping)

        eold = ExternalOld(3, 5)
        x = torch.randn(4, 3)
        eold.forward(x)

        enew = External(3, 5)
        enew._load_from_pretrained_module(eold)
        mapping = {
            val: key
            for key, val in enew._construct_default_mapping(
                eold, "huggingface", {}).items()
        }
        assert_equal_parameters(eold, enew, mapping=mapping)
Example #12
    def test_get_mapped_state_dict(self):
        class InternalOld(torch.nn.Module):
            def __init__(self, inp, out):
                super().__init__()
                self.ff = torch.nn.Linear(inp, out)
                self.p = Parameter(torch.randn(out, out))
                self.register_buffer("b", torch.randn(inp, inp))

            def forward(self, x):
                x = self.ff(x).matmul(self.p)
                return x

        class InternalNew(TransformerModule):
            _pretrained_mapping = {"ff": "linear", "p": "param", "b": "buffer"}

            def __init__(self, inp, out):
                super().__init__()
                self.linear = torch.nn.Linear(inp, out)
                self.param = Parameter(torch.randn(out, out))
                self.register_buffer("buffer", torch.randn(inp, inp))

            def forward(self, x):
                x = self.linear(x).matmul(self.param)
                return x

        class ExternalOld(torch.nn.Module):
            def __init__(self, inp, out):
                super().__init__()
                self.internal = InternalOld(inp, out)
                self.p = Parameter(torch.randn(out, out))

            def forward(self, x):
                x = self.internal(x).matmul(self.p)
                return x

        class ExternalNew(TransformerModule):
            _pretrained_mapping = {"internal": "internal_layer", "p": "param"}

            def __init__(self, inp, out):
                super().__init__()
                self.internal_layer = InternalNew(inp, out)
                self.param = Parameter(torch.randn(out, out))

            def forward(self, x):
                x = self.internal_layer(x).matmul(self.param)
                return x

        eold = ExternalOld(3, 5)
        state_dict_old = eold.state_dict()

        enew = ExternalNew(3, 5)
        state_dict_new = enew._get_mapped_state_dict(state_dict_old)
        assert set(state_dict_new.keys()) == set(
            [
                "internal_layer.linear.weight",
                "internal_layer.linear.bias",
                "internal_layer.param",
                "internal_layer.buffer",
                "param",
            ]
        )

        enew.load_state_dict(state_dict_new)

        x = torch.randn(4, 3)
        out_old = eold(x)
        out_new = enew(x)
        assert_allclose(out_old, out_new)

        assert_equal_parameters(
            eold,
            enew,
            mapping={
                "internal_layer.linear.weight": "internal.ff.weight",
                "internal_layer.linear.bias": "internal.ff.bias",
                "internal_layer.param": "internal.p",
                "internal_layer.buffer": "internal.b",
                "param": "p",
            },
        )
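
The last test above relies on _get_mapped_state_dict and the _pretrained_mapping class attributes to rename keys before calling load_state_dict. A simple, illustrative version of that renaming (a hypothetical helper, not the TransformerModule code), where each dot-separated segment of a key is renamed through the mapping if present:

# Hypothetical helper illustrating state-dict key renaming by name segment.
def rename_state_dict_keys(state_dict, mapping):
    renamed = {}
    for old_key, tensor in state_dict.items():
        parts = [mapping.get(part, part) for part in old_key.split(".")]
        renamed[".".join(parts)] = tensor
    return renamed

# For instance, with
# combined_mapping = {"internal": "internal_layer", "ff": "linear",
#                     "p": "param", "b": "buffer"}
# rename_state_dict_keys(eold.state_dict(), combined_mapping) produces exactly
# the key set asserted above ("internal_layer.linear.weight", ..., "param").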