    def test_layer_same_as_params(self):
        params = copy.deepcopy(self.params_dict)
        num_hidden_layers = params.pop("num_hidden_layers")
        # params = Params(params)

        torch.manual_seed(1234)
        transformer_layer = TransformerLayer(**params)
        transformer_stack_from_layer = TransformerStack(
            num_hidden_layers, transformer_layer)
        torch.manual_seed(1234)
        transformer_stack_from_params = TransformerStack(
            num_hidden_layers, **params)

        hidden_states = torch.randn(2, 3, 6)
        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

        transformer_stack_from_layer.eval()
        transformer_stack_from_params.eval()

        torch.manual_seed(1234)
        layer_output = transformer_stack_from_layer.forward(
            hidden_states, attention_mask=attention_mask)

        torch.manual_seed(1234)
        params_output = transformer_stack_from_params.forward(
            hidden_states, attention_mask=attention_mask)

        assert torch.allclose(layer_output[0], params_output[0])
Example #2
 def __init__(self):
     super().__init__()
     self.embeddings = TransformerEmbeddings.from_pretrained_module(
         "bert-base-uncased")
     self.separate_transformer = TransformerStack.from_pretrained_module(
         "bert-base-uncased", num_hidden_layers=range(0, 8))
     self.combined_transformer = TransformerStack.from_pretrained_module(
         "bert-base-uncased",
         num_hidden_layers=range(8, 12),
     )
Example #3
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    pretrained)

                self.transformer = TransformerStack.from_pretrained_module(
                    pretrained, num_hidden_layers=4)
Example #4
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model: str = "roberta-large",
        override_weights_file: Optional[str] = None,
        **kwargs
    ) -> None:
        super().__init__(vocab, **kwargs)
        transformer_kwargs = {
            "model_name": transformer_model,
            "weights_path": override_weights_file,
        }
        self.embeddings = TransformerEmbeddings.from_pretrained_module(**transformer_kwargs)
        self.transformer_stack = TransformerStack.from_pretrained_module(**transformer_kwargs)
        self.pooler = TransformerPooler.from_pretrained_module(**transformer_kwargs)
        self.pooler_dropout = Dropout(p=0.1)

        self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(), 1)
        self.linear_layer.weight.data.normal_(mean=0.0, std=0.02)
        self.linear_layer.bias.data.zero_()

        self.loss = torch.nn.CrossEntropyLoss()

        from allennlp.training.metrics import CategoricalAccuracy

        self.accuracy = CategoricalAccuracy()
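The classifier head above uses the usual BERT-style initialization; the same pattern on a standalone layer, as a small sketch (the 1024 input size is only illustrative, standing in for the pooler's output dim):

import torch

# Normal init with std 0.02 for the weight, zero bias, as in the model above.
linear_layer = torch.nn.Linear(1024, 1)
linear_layer.weight.data.normal_(mean=0.0, std=0.02)
linear_layer.bias.data.zero_()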
Example #5
 def __init__(self):
     super().__init__()
     self.embeddings = TransformerEmbeddings.get_relevant_module(
         "albert-base-v2")
     self.transformer = TransformerStack.from_pretrained_module(
         "bert-base-uncased")
     # We want to tune only the embeddings, because that's our experiment.
     self.transformer.requires_grad_(False)
Example #6
 def __init__(self):
     super().__init__()
     self.embeddings = AutoModel.from_pretrained(
         "albert-base-v2").embeddings
     self.transformer = TransformerStack.from_pretrained_module(
         "bert-base-cased", relevant_module="bert.encoder")
     # We want to tune only the embeddings, because that's our experiment.
     self.transformer.requires_grad_(False)
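For reference, the freeze pattern used in the two examples above, shown on a standalone module as a minimal sketch (the tiny Linear is just a stand-in for the transformer stack):

import torch

module = torch.nn.Linear(4, 4)  # stand-in for the transformer stack
module.requires_grad_(False)    # turns off gradients for every parameter in the module
assert all(not p.requires_grad for p in module.parameters())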
Example #7
 def test_loading_from_pretrained_weights(self):
     pretrained_module = self.pretrained.encoder
     module = TransformerStack.from_pretrained_module(pretrained_module)
     mapping = {
         val: key
         for key, val in module._construct_default_mapping(
             pretrained_module, "huggingface", {}).items()
     }
     assert_equal_parameters(pretrained_module, module, mapping)
Example #8
 def __init__(self):
     super().__init__()
     self.embeddings = TransformerEmbeddings.from_pretrained_module(
         pretrained, relevant_module="bert.embeddings")
     self.transformer = TransformerStack.from_pretrained_module(
         pretrained,
         num_hidden_layers=4,
         relevant_module="bert.encoder",
         strict=False,
     )
Example #9
 def __init__(self):
     super().__init__()
     self.embeddings = TransformerEmbeddings.from_pretrained_module(
         "bert-base-cased", relevant_module="bert.embeddings")
     self.separate_transformer = TransformerStack.from_pretrained_module(
         "bert-base-cased",
         relevant_module="bert.encoder",
         num_hidden_layers=8,
         strict=False,
     )
     self.combined_transformer = TransformerStack.from_pretrained_module(
         "bert-base-cased",
         relevant_module="bert.encoder",
         num_hidden_layers=4,
         mapping={
             f"layer.{l}": f"layers.{i}"
             for (i, l) in enumerate(range(8, 12))
         },
         strict=False,
     )
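For clarity, the mapping comprehension above pairs pretrained layers 8–11 with positions 0–3 of the new stack; a standalone check of just the dict it builds (no model download required):

# The comprehension used for `mapping` above evaluates to this renaming.
mapping = {f"layer.{l}": f"layers.{i}" for (i, l) in enumerate(range(8, 12))}
assert mapping == {
    "layer.8": "layers.0",
    "layer.9": "layers.1",
    "layer.10": "layers.2",
    "layer.11": "layers.3",
}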
Example #10
def test_transformer_stack_from_params(params):
    torch.manual_seed(SEED)
    transformer_stack = TransformerStack.from_params(params)

    # Make sure we have the right number of modules.
    modules = dict(transformer_stack.named_modules())
    assert len(modules["layers"]) == PARAMS_DICT["num_hidden_layers"]

    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    # Make sure forward pass can run.
    torch.manual_seed(SEED)
    output = transformer_stack.forward(hidden_states,
                                       attention_mask=attention_mask)

    # Make sure we get the same results when instantiating from a single layer.
    torch.manual_seed(SEED)
    layer_params = copy.deepcopy(PARAMS_DICT)
    num_hidden_layers = layer_params.pop("num_hidden_layers")
    transformer_layer = TransformerLayer(
        **layer_params)  # type: ignore[arg-type]
    transformer_stack_from_layer = TransformerStack(
        num_hidden_layers,
        transformer_layer  # type: ignore[arg-type]
    )

    torch.manual_seed(SEED)
    from_layer_output = transformer_stack_from_layer.forward(
        hidden_states, attention_mask=attention_mask)

    assert torch.allclose(from_layer_output.final_hidden_states,
                          output.final_hidden_states)

    # Make sure forward pass raises with bad input.
    with pytest.raises(AssertionError):
        transformer_stack.forward(
            torch.randn(2, 3, 6),
            attention_mask=torch.randn(2, 3),
            encoder_hidden_states=torch.randn(2, 3, 6),
        )
Example #11
    def test_cross_attention(self):
        params = copy.deepcopy(self.params_dict)
        params["add_cross_attention"] = True

        params = Params(params)

        transformer_stack = TransformerStack.from_params(params)
        modules = dict(transformer_stack.named_modules())

        assert hasattr(modules["layers.0"], "cross_attention")

        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
        transformer_stack.forward(
            torch.randn(2, 3, 6),
            attention_mask=attention_mask,
            encoder_hidden_states=torch.randn(2, 3, 6),
        )

        transformer_stack_new = TransformerStack.from_pretrained_module(
            transformer_stack, source="allennlp")

        new_modules = dict(transformer_stack_new.named_modules())
        assert hasattr(new_modules["layers.0"], "cross_attention")
Example #12
    def test_forward_against_huggingface_outputs(self, module_name, hf_module):
        hidden_states = torch.randn(2, 3, 6)
        attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

        stack = TransformerStack.from_pretrained_module(hf_module)

        torch.manual_seed(1234)
        output = stack.forward(hidden_states, attention_mask=attention_mask)
        # We do this because bert, roberta, electra process the attention_mask at the model level.
        attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5
        torch.manual_seed(1234)
        hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf)

        assert torch.allclose(output[0], hf_output[0])
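The additive-mask conversion used above, shown on its own with the same toy shapes (illustrative only; it reproduces the mask that the comment in the test refers to):

import torch

attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
# Masked positions (0) receive a large negative bias; attended positions (1) receive 0.
attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5
assert attention_mask_hf.shape == (2, 2, 3, 3)
assert attention_mask_hf[0, 0, 0, 0].item() == -10e5
assert attention_mask_hf[0, 0, 0, 1].item() == 0.0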
Example #13
def test_transformer_stack_with_cross_attention(params):
    params["add_cross_attention"] = True

    transformer_stack = TransformerStack.from_params(params).eval()
    modules = dict(transformer_stack.named_modules())

    assert hasattr(modules["layers.0"], "cross_attention")

    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
    transformer_stack.forward(
        torch.randn(2, 3, 6),
        attention_mask=attention_mask,
        encoder_hidden_states=torch.randn(2, 3, 6),
    )
Example #14
            def __init__(self, vocab, embedding_dim, hidden_size,
                         intermediate_size):
                super().__init__()
                self.embeddings = Embedding(
                    pretrained_file=embedding_file,
                    embedding_dim=embedding_dim,
                    projection_dim=hidden_size,
                    vocab=vocab,
                )

                self.transformer = TransformerStack(
                    num_hidden_layers=4,
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                )
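Putting the direct-construction pattern above together with the small hyperparameters used in the test fixtures elsewhere in this file, a minimal runnable sketch (sizes are toy fixture values, and the `allennlp.modules.transformer` import path is assumed):

import torch
from allennlp.modules.transformer import TransformerStack

# Tiny stack mirroring the test fixture; runs a single forward pass on random input.
stack = TransformerStack(
    num_hidden_layers=3,
    hidden_size=6,
    intermediate_size=3,
    num_attention_heads=2,
    attention_dropout=0.1,
    hidden_dropout=0.2,
    activation="relu",
).eval()
hidden_states = torch.randn(2, 3, 6)
attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])
output = stack(hidden_states, attention_mask=attention_mask)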
Example #15
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            pretrained_module = pretrained.transformer
        else:
            pretrained_module = pretrained.encoder

        torch.manual_seed(1234)
        module = TransformerStack.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 1
        seq_len = 768
        dim = dict(module.named_modules()
                   )["layers.0.attention.self.query"].in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, seq_len))
        mask_reshp = (batch_size, 1, 1, dim)
        attention_mask_hf = (attention_mask == 0).view(mask_reshp)
        attention_mask_hf = attention_mask_hf.expand(batch_size, 12, seq_len,
                                                     seq_len) * -10e5

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, attention_mask=attention_mask_hf)[0]

        assert torch.allclose(output, hf_output)
Example #16
    def setup_method(self):
        super().setup_method()

        self.params_dict = {
            "num_hidden_layers": 3,
            "hidden_size": 6,
            "intermediate_size": 3,
            "num_attention_heads": 2,
            "attention_dropout": 0.1,
            "hidden_dropout": 0.2,
            "activation": "relu",
        }

        params = Params(copy.deepcopy(self.params_dict))

        self.transformer_stack = TransformerStack.from_params(params)

        self.pretrained_name = "bert-base-uncased"

        self.pretrained = cached_transformers.get(self.pretrained_name, False)
Example #17
    def test_loading_partial_pretrained_weights(self):

        kwargs = TransformerStack._get_input_arguments(self.pretrained.encoder)
        # The pretrained module has 12 bert layers, while the instance will have only 3.
        kwargs["num_hidden_layers"] = 3
        transformer_stack = TransformerStack(**kwargs)
        transformer_stack._load_from_pretrained_module(self.pretrained.encoder)
        mapping = {
            val: key
            for key, val in transformer_stack._construct_default_mapping(
                self.pretrained.encoder, "huggingface", {}).items()
        }
        assert_equal_parameters(
            self.pretrained.encoder,
            transformer_stack,
            mapping,
        )
Example #18
def test_loading_from_pretrained(pretrained_model_name):
    transformer_stack = TransformerStack.from_pretrained_module(
        pretrained_model_name).eval()
    pretrained_module = cached_transformers.get(pretrained_model_name,
                                                True).encoder.eval()

    batch_size = 2
    seq_length = 15
    hidden_size = transformer_stack.layers[0]._hidden_size

    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(SEED)
    output = transformer_stack(hidden_states, attention_mask=attention_mask)

    torch.manual_seed(SEED)
    hf_output = pretrained_module(hidden_states,
                                  attention_mask=attention_mask_hf)

    assert torch.allclose(output.final_hidden_states, hf_output[0])
Example #19
    def test_end_to_end(self, model_name: str):
        data = [
            ("I'm against picketing", "but I don't know how to show it."),
            ("I saw a human pyramid once.", "It was very unnecessary."),
        ]
        tokenizer = cached_transformers.get_tokenizer(model_name)
        batch = tokenizer.batch_encode_plus(data,
                                            padding=True,
                                            return_tensors="pt")

        with torch.no_grad():
            huggingface_model = cached_transformers.get(
                model_name, make_copy=False).eval()
            huggingface_output = huggingface_model(**batch)

            embeddings = TransformerEmbeddings.from_pretrained_module(
                model_name).eval()
            transformer_stack = TransformerStack.from_pretrained_module(
                model_name).eval()
            pooler = TransformerPooler.from_pretrained_module(
                model_name).eval()
            batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
            output = embeddings(**batch)
            output = transformer_stack(output, batch["attention_mask"])

            assert_allclose(
                output.final_hidden_states,
                huggingface_output.last_hidden_state,
                rtol=0.0001,
                atol=1e-4,
            )

            output = pooler(output.final_hidden_states)
            assert_allclose(output,
                            huggingface_output.pooler_output,
                            rtol=0.0001,
                            atol=1e-4)
Example #20
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model: str = "roberta-large",
        num_labels: Optional[int] = None,
        label_namespace: str = "labels",
        override_weights_file: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        transformer_kwargs = {
            "model_name": transformer_model,
            "weights_path": override_weights_file,
        }
        self.embeddings = TransformerEmbeddings.from_pretrained_module(
            **transformer_kwargs)
        self.transformer_stack = TransformerStack.from_pretrained_module(
            **transformer_kwargs)
        self.pooler = TransformerPooler.from_pretrained_module(
            **transformer_kwargs)
        self.pooler_dropout = Dropout(p=0.1)

        self.label_tokens = vocab.get_index_to_token_vocabulary(
            label_namespace)
        if num_labels is None:
            num_labels = len(self.label_tokens)
        self.linear_layer = torch.nn.Linear(self.pooler.get_output_dim(),
                                            num_labels)
        self.linear_layer.weight.data.normal_(mean=0.0, std=0.02)
        self.linear_layer.bias.data.zero_()

        from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure

        self.loss = torch.nn.CrossEntropyLoss()
        self.acc = CategoricalAccuracy()
        self.f1 = FBetaMeasure()
Example #21
def test_loading_partial_pretrained_weights():
    # The pretrained module has 12 bert layers, while the instance will have only 3.
    TransformerStack.from_pretrained_module("bert-base-cased",
                                            num_hidden_layers=3,
                                            strict=False)