Example #1
 def test_get_missing_from_cache_local_files_only(self):
     with pytest.raises((OSError, ValueError)):
         cached_transformers.get(
             "bert-base-uncased",
             True,
             cache_dir=self.TEST_DIR,
             local_files_only=True,
         )
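For contrast with the failure case above, here is a minimal sketch of a successful call (an illustrative addition, not taken from a test suite). It assumes network access or an already-populated cache and uses only what these examples show: the model name, the positional make_copy flag, and keyword arguments such as load_weights, cache_dir, and local_files_only that are passed through to the underlying Hugging Face loader.

from allennlp.common import cached_transformers

model = cached_transformers.get(
    "bert-base-uncased",  # Hugging Face model name
    False,                # make_copy: False returns the shared cached instance
    load_weights=True,    # False would build the architecture from the config only
)
print(model.config.hidden_size)  # 768 for bert-base-uncased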
Example #2
 def test_get_missing_from_cache_local_files_only(self):
     with pytest.raises(ValueError) as execinfo:
         cached_transformers.get(
             "bert-base-uncased",
             True,
             cache_dir=self.TEST_DIR,
             local_files_only=True,
         )
     assert str(execinfo.value) == (
         "Cannot find the requested files in the cached path and "
         "outgoing traffic has been disabled. To enable model "
         "look-ups and downloads online, set 'local_files_only' "
         "to False.")
Example #3
    def __init__(
        self,
        pretrained_model: str,
        *,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        load_weights: bool = True,
        requires_grad: bool = True,
        dropout: float = 0.0,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()

        from allennlp.common import cached_transformers

        model = cached_transformers.get(
            pretrained_model,
            False,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            load_weights=load_weights,
            **(transformer_kwargs or {}),
        )

        self._dropout = torch.nn.Dropout(p=dropout)

        import copy

        self.pooler = copy.deepcopy(model.pooler)
        for param in self.pooler.parameters():
            param.requires_grad = requires_grad
        self._embedding_dim = model.config.hidden_size
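The constructor above matches the __init__ of AllenNLP's BertPooler Seq2VecEncoder; a hypothetical usage sketch follows. The class name, import path, and get_output_dim() call are assumptions based on the Seq2VecEncoder API, since only __init__ appears in this example.

from allennlp.modules.seq2vec_encoders import BertPooler

pooler = BertPooler(
    "bert-base-uncased",
    requires_grad=False,  # freeze the copied pooler's parameters
    dropout=0.1,
)
print(pooler.get_output_dim())  # expected to equal the model's hidden size (768 here)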
Example #4
    def __init__(self,
                 model_name: str,
                 *,
                 max_length: int = None,
                 sub_module: str = None,
                 train_parameters: bool = True,
                 override_weights_file: Optional[str] = None,
                 override_weights_strip_prefix: Optional[str] = None) -> None:
        super().__init__()
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file,
            override_weights_strip_prefix)
        self.config = self.transformer_model.config
        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length
        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        tokenizer = PretrainedTransformerTokenizer(model_name)
        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
Example #5
    def setup_method(self):
        super().setup_method()

        self.params_dict = {
            "num_hidden_layers1": 3,
            "num_hidden_layers2": 3,
            "hidden_size1": 12,
            "hidden_size2": 12,
            "combined_hidden_size": 12,
            "intermediate_size1": 3,
            "intermediate_size2": 3,
            "num_attention_heads1": 4,
            "num_attention_heads2": 6,
            "combined_num_attention_heads": 2,
            "attention_dropout1": 0.1,
            "hidden_dropout1": 0.2,
            "attention_dropout2": 0.1,
            "hidden_dropout2": 0.2,
            "activation": "relu",
            "biattention_id1": [1, 2],
            "biattention_id2": [1, 2],
            "fixed_layer1": 1,
            "fixed_layer2": 1,
        }

        params = Params(copy.deepcopy(self.params_dict))

        self.bimodal_encoder = BiModalEncoder.from_params(params)

        self.pretrained = cached_transformers.get("bert-base-uncased", False)
Example #6
def test_layer_from_pretrained(pretrained_name, relevant_top_level_module):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False).eval()

    if "distilbert" in pretrained_name:
        encoder = pretrained.transformer
    else:
        encoder = pretrained.encoder
    # Hacky way to get a bert layer.
    pretrained_module = list(encoder.layer.modules())[1]

    torch.manual_seed(1234)
    module = TransformerLayer.from_pretrained_module(
        pretrained_name,
        relevant_module=None
        if relevant_top_level_module is None
        else f"{relevant_top_level_module}.encoder.layer.0",
    ).eval()

    batch_size = 2
    seq_length = 15
    hidden_size = module.attention.self.query.in_features

    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(1234)
    output = module(hidden_states, attention_mask=attention_mask.squeeze()).hidden_states

    torch.manual_seed(1234)
    hf_output = pretrained_module(hidden_states, attention_mask=attention_mask_hf)[0]

    assert torch.allclose(output, hf_output, atol=1e-04)
Example #7
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        masked_language_modeling: bool = True,
    ) -> None:
        TokenEmbedder.__init__(self)  # Call the base class constructor
        tokenizer = PretrainedTransformerTokenizer(model_name)
        self.masked_language_modeling = masked_language_modeling

        if self.masked_language_modeling:
            self.config = AutoConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            # We only need access to the HF tokenizer if we are masked language modeling
            self.tokenizer = tokenizer.tokenizer
            # The only differences when masked language modeling are:
            # 1) `output_hidden_states` must be True to get access to token embeddings.
            # 2) We need to use `AutoModelForMaskedLM` to get the correct model
            self.transformer_model = AutoModelForMaskedLM.from_pretrained(
                model_name, config=self.config)
        # Everything after the if statement (including the else) is copied directly from:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
        else:
            from allennlp.common import cached_transformers

            self.transformer_model = cached_transformers.get(
                model_name, True, override_weights_file,
                override_weights_strip_prefix)
            self.config = self.transformer_model.config

        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
Example #8
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        # Get the self attention layer.
        if "distilbert" in pretrained_name:
            pretrained_module = pretrained_module.attention
        else:
            pretrained_module = pretrained_module.attention.self

        torch.manual_seed(1234)
        module = SelfAttention.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 3
        dim = module.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len))

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        if "distilbert" in pretrained_name:
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(hidden_states,
                                                  hidden_states,
                                                  hidden_states,
                                                  mask=attention_mask)[0]
        else:
            # The attn_mask is processed outside the self attention module in HF bert models.
            attention_mask = (~(attention_mask == 1)) * -10e5
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(
                hidden_states, attention_mask=attention_mask)[0]

        assert torch.allclose(output, hf_output)
Example #9
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name,
            True,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            **(transformer_kwargs or {}),
        )

        if gradient_checkpointing is not None:
            self.transformer_model.config.update(
                {"gradient_checkpointing": gradient_checkpointing})

        self.config = self.transformer_model.config
        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model,
                                             sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        tokenizer = PretrainedTransformerTokenizer(
            model_name,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._num_added_start_tokens = len(
            tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
Example #10
 def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
     pretrained_module = cached_transformers.get(pretrained_name, False).embeddings
     module = TransformerEmbeddings.from_pretrained_module(pretrained_name)
     mapping = {
         val: key
         for key, val in module._construct_default_mapping(
             pretrained_module, "huggingface", {}
         ).items()
     }
     missing = assert_equal_parameters(pretrained_module, module, mapping=mapping)
     assert len(missing) == 0
Example #11
    def get_relevant_module(
        cls,
        pretrained_module: Union[str, torch.nn.Module],
        relevant_module: Optional[Union[str, List[str]]] = None,
        source: str = "huggingface",
        mapping: Optional[Dict[str, str]] = None,
        load_weights: bool = True,
    ):
        """
        Returns the relevant underlying module given a model name/object.

        # Parameters

        pretrained_module : `Union[str, torch.nn.Module]`
            Name of the transformer model containing the layer,
            or the actual layer (not the model object).
        relevant_module : `Optional[Union[str, List[str]]]`, optional
            Name of the desired module. Defaults to cls._relevant_module.
        source : `str`, optional
            Where the model came from. Default - huggingface.
        mapping : `Dict[str, str]`, optional
            Optional mapping that determines any differences in the module names
            between the class modules and the input model's modules.
            Default - cls._huggingface_mapping
        load_weights : `bool`, optional
            Whether or not to load the pretrained weights.
            Default is `True`.
        """
        if isinstance(pretrained_module, str):
            pretrained_module = cached_transformers.get(
                pretrained_module, False, load_weights=load_weights)

        relevant_module = relevant_module or cls._relevant_module

        if relevant_module is not None:
            submodules = cls._get_mapped_submodules(pretrained_module, source,
                                                    mapping)
            # If the relevant_module is not found, we assume that the pretrained_module
            # is already the relevant module.
            if isinstance(relevant_module, str):
                relevant_module = [relevant_module]
            found = False
            for module in relevant_module:
                if module in submodules:
                    pretrained_module = submodules[module]
                    found = True
                    break

            if not found:
                logger.warning(
                    "{} was not found! The submodules are: {}".format(
                        relevant_module, submodules.keys()))
        return pretrained_module
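As a hypothetical illustration of the docstring above, the sketch below pulls the encoder submodule out of a BERT model by name. It assumes get_relevant_module is exposed as a classmethod on a TransformerModule subclass such as TransformerStack and that "encoder" is among the mapped submodule names; load_weights=False skips loading pretrained weights, as in Examples #13 and #18.

from allennlp.modules.transformer import TransformerStack

encoder = TransformerStack.get_relevant_module(
    "bert-base-uncased",
    relevant_module="encoder",
    load_weights=False,  # build from the config only; no pretrained weights loaded
)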
Example #12
    def test_use_selected_layers_of_bert_for_different_purposes(self):
        class MediumTransformer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    "bert-base-uncased")
                self.separate_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-uncased", num_hidden_layers=range(0, 8))
                self.combined_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-uncased",
                    num_hidden_layers=range(8, 12),
                )

            @overrides
            def forward(
                self,
                left_token_ids: torch.LongTensor,
                right_token_ids: torch.LongTensor,
            ):

                left = self.embeddings(left_token_ids)
                left = self.separate_transformer(left)

                right = self.embeddings(right_token_ids)
                right = self.separate_transformer(right)

                # combine the sequences in some meaningful way. here, we just add them.
                # combined = combine_masked_sequences(left, left_mask, right, right_mask)
                combined = left + right

                return self.combined_transformer(combined)

        medium = MediumTransformer()
        assert (len(medium.separate_transformer.layers)) == 8
        assert (len(medium.combined_transformer.layers)) == 4

        pretrained = cached_transformers.get("bert-base-uncased", False)
        pretrained_layers = dict(pretrained.encoder.layer.named_modules())

        medium_layers = dict(
            medium.combined_transformer.layers.named_modules())

        assert_equal_parameters(medium_layers["0"], pretrained_layers["8"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["1"], pretrained_layers["9"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["2"], pretrained_layers["10"],
                                TransformerStack._huggingface_mapping)
        assert_equal_parameters(medium_layers["3"], pretrained_layers["11"],
                                TransformerStack._huggingface_mapping)
Example #13
    def test_from_pretrained_no_load_weights_local_config(self):
        config = AutoConfig.from_pretrained("epwalsh/bert-xsmall-dummy",
                                            cache_dir=self.TEST_DIR)
        self.clear_test_dir()

        # Save config to file.
        local_config_path = str(self.TEST_DIR / "local_config.json")
        config.to_json_file(local_config_path, use_diff=False)

        # Now load the model from the local config.
        _ = cached_transformers.get(local_config_path,
                                    False,
                                    load_weights=False,
                                    cache_dir=self.TEST_DIR)
        # Make sure no other files were downloaded.
        assert os.listdir(str(self.TEST_DIR)) == ["local_config.json"]
Example #14
    def test_loading_from_pretrained_weights_using_model_name(
            self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        torch.manual_seed(1234)
        module = TransformerLayer.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 768
        dim = module.attention.self.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, seq_len))
        mask_reshp = (batch_size, 1, 1, dim)
        attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand(
            batch_size, 12, seq_len, seq_len) * -10e5

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states,
                                attention_mask=attention_mask.squeeze())[0]
        torch.manual_seed(1234)
        hf_output = pretrained_module.forward(
            hidden_states, attention_mask=attention_mask_hf)[0]

        assert torch.allclose(output, hf_output, atol=1e-04)
Example #15
    def test_from_pretrained_avoids_weights_download_if_override_weights(self):
        config = AutoConfig.from_pretrained("epwalsh/bert-xsmall-dummy",
                                            cache_dir=self.TEST_DIR)
        # only download config because downloading pretrained weights in addition takes too long
        transformer = AutoModel.from_config(config)

        # clear cache directory
        self.clear_test_dir()

        save_weights_path = str(self.TEST_DIR / "bert_weights.pth")
        torch.save(transformer.state_dict(), save_weights_path)

        override_transformer = cached_transformers.get(
            "epwalsh/bert-xsmall-dummy",
            False,
            override_weights_file=save_weights_path,
            cache_dir=self.TEST_DIR,
        )
        # check that only three files were downloaded (filename.json, filename, filename.lock), for config.json
        # if more than three files were downloaded, then model weights were also (incorrectly) downloaded
        # NOTE: downloaded files are not explicitly detailed in Huggingface's public API,
        # so this assertion could fail in the future
        json_fnames = [
            fname for fname in os.listdir(str(self.TEST_DIR))
            if fname.endswith(".json")
        ]
        assert len(json_fnames) == 1
        json_data = json.load(open(str(self.TEST_DIR / json_fnames[0])))
        assert (
            json_data["url"] ==
            "https://huggingface.co/epwalsh/bert-xsmall-dummy/resolve/main/config.json"
        )
        resource_id = os.path.splitext(json_fnames[0])[0]
        assert set(os.listdir(str(self.TEST_DIR))) == set([
            json_fnames[0], resource_id, resource_id + ".lock",
            "bert_weights.pth"
        ])

        # check that override weights were loaded correctly
        for p1, p2 in zip(transformer.parameters(),
                          override_transformer.parameters()):
            assert p1.data.ne(p2.data).sum() == 0
Example #16
    def setup_method(self):
        super().setup_method()

        self.params_dict = {
            "hidden_size": 6,
            "intermediate_size": 3,
            "num_attention_heads": 2,
            "attention_dropout": 0.1,
            "hidden_dropout": 0.2,
            "activation": "relu",
        }

        params = Params(copy.deepcopy(self.params_dict))

        self.transformer_layer = TransformerLayer.from_params(params)
        self.pretrained_name = "bert-base-uncased"

        self.pretrained = cached_transformers.get(self.pretrained_name, False)
Example #17
    def test_use_first_four_layers_of_pretrained(self):
        pretrained = cached_transformers.get("bert-base-uncased", False)

        class SmallTransformer(TokenEmbedder):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    pretrained)

                self.transformer = TransformerStack.from_pretrained_module(
                    pretrained, num_hidden_layers=4)

            @overrides
            def forward(self, token_ids: torch.LongTensor):
                x = self.embeddings(token_ids)
                x = self.transformer(x)
                return x

        small = SmallTransformer()
        assert len(small.transformer.layers) == 4
        small.forward(torch.LongTensor([[0, 1, 2]]))
Example #18
 def test_from_pretrained_no_load_weights(self):
     _ = cached_transformers.get("epwalsh/bert-xsmall-dummy",
                                 False,
                                 load_weights=False,
                                 cache_dir=self.TEST_DIR)
     # check that only three files were downloaded (filename.json, filename, filename.lock), for config.json
     # if more than three files were downloaded, then model weights were also (incorrectly) downloaded
     # NOTE: downloaded files are not explicitly detailed in Huggingface's public API,
     # so this assertion could fail in the future
     json_fnames = [
         fname for fname in os.listdir(str(self.TEST_DIR))
         if fname.endswith(".json")
     ]
     assert len(json_fnames) == 1
     json_data = json.load(open(str(self.TEST_DIR / json_fnames[0])))
     assert (
         json_data["url"] ==
         "https://huggingface.co/epwalsh/bert-xsmall-dummy/resolve/main/config.json"
     )
     resource_id = os.path.splitext(json_fnames[0])[0]
     assert set(os.listdir(str(self.TEST_DIR))) == set(
         [json_fnames[0], resource_id, resource_id + ".lock"])
Example #19
def test_loading_from_pretrained(pretrained_model_name):
    transformer_stack = TransformerStack.from_pretrained_module(
        pretrained_model_name).eval()
    pretrained_module = cached_transformers.get(pretrained_model_name,
                                                True).encoder.eval()

    batch_size = 2
    seq_length = 15
    hidden_size = transformer_stack.layers[0]._hidden_size

    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(SEED)
    output = transformer_stack(hidden_states, attention_mask=attention_mask)

    torch.manual_seed(SEED)
    hf_output = pretrained_module(hidden_states,
                                  attention_mask=attention_mask_hf)

    assert torch.allclose(output.final_hidden_states, hf_output[0])
Example #20
    def test_end_to_end(self, model_name: str):
        data = [
            ("I'm against picketing", "but I don't know how to show it."),
            ("I saw a human pyramid once.", "It was very unnecessary."),
        ]
        tokenizer = cached_transformers.get_tokenizer(model_name)
        batch = tokenizer.batch_encode_plus(data,
                                            padding=True,
                                            return_tensors="pt")

        with torch.no_grad():
            huggingface_model = cached_transformers.get(
                model_name, make_copy=False).eval()
            huggingface_output = huggingface_model(**batch)

            embeddings = TransformerEmbeddings.from_pretrained_module(
                model_name).eval()
            transformer_stack = TransformerStack.from_pretrained_module(
                model_name).eval()
            pooler = TransformerPooler.from_pretrained_module(
                model_name).eval()
            batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
            output = embeddings(**batch)
            output = transformer_stack(output, batch["attention_mask"])

            assert_allclose(
                output.final_hidden_states,
                huggingface_output.last_hidden_state,
                rtol=0.0001,
                atol=1e-4,
            )

            output = pooler(output.final_hidden_states)
            assert_allclose(output,
                            huggingface_output.pooler_output,
                            rtol=0.0001,
                            atol=1e-4)
Example #21
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        eval_mode: bool = False,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name,
            True,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            **(transformer_kwargs or {}),
        )

        if gradient_checkpointing is not None:
            self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

        self.config = self.transformer_model.config
        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model, sub_module)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        tokenizer = PretrainedTransformerTokenizer(
            model_name,
            tokenizer_kwargs=tokenizer_kwargs,
        )

        try:
            if self.transformer_model.get_input_embeddings().num_embeddings != len(
                tokenizer.tokenizer
            ):
                self.transformer_model.resize_token_embeddings(len(tokenizer.tokenizer))
        except NotImplementedError:
            # Can't resize for transformers models that don't implement base_model.get_input_embeddings()
            logger.warning(
                "Could not resize the token embedding matrix of the transformer model. "
                "This model does not support resizing."
            )

        self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        self.train_parameters = train_parameters
        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False

        self.eval_mode = eval_mode
        if eval_mode:
            self.transformer_model.eval()
Example #22
    def test_use_selected_layers_of_bert_for_different_purposes(self):
        class MediumTransformer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    "bert-base-cased", relevant_module="bert.embeddings")
                self.separate_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=8,
                    strict=False,
                )
                self.combined_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=4,
                    mapping={
                        f"layer.{l}": f"layers.{i}"
                        for (i, l) in enumerate(range(8, 12))
                    },
                    strict=False,
                )

            @overrides
            def forward(
                self,
                left_token_ids: torch.LongTensor,
                right_token_ids: torch.LongTensor,
            ):

                left = self.embeddings(left_token_ids)
                left = self.separate_transformer(left)

                right = self.embeddings(right_token_ids)
                right = self.separate_transformer(right)

                # combine the sequences in some meaningful way. here, we just add them.
                # combined = combine_masked_sequences(left, left_mask, right, right_mask)
                combined = left + right

                return self.combined_transformer(combined)

        medium = MediumTransformer()
        assert (len(medium.separate_transformer.layers)) == 8
        assert (len(medium.combined_transformer.layers)) == 4

        pretrained = cached_transformers.get("bert-base-cased", False)
        pretrained_layers = dict(pretrained.encoder.layer.named_modules())

        separate_layers = dict(
            medium.separate_transformer.layers.named_modules())
        assert_allclose(
            separate_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["0"].intermediate.dense.weight.data,
        )

        combined_layers = dict(
            medium.combined_transformer.layers.named_modules())
        assert_allclose(
            combined_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["8"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["1"].intermediate.dense.weight.data,
            pretrained_layers["9"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["2"].intermediate.dense.weight.data,
            pretrained_layers["10"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["3"].intermediate.dense.weight.data,
            pretrained_layers["11"].intermediate.dense.weight.data,
        )
Example #23
    def __init__(
        self,
        model_name: str,
        *,
        max_length: int = None,
        sub_module: str = None,
        train_parameters: bool = True,
        last_layer_only: bool = True,
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        gradient_checkpointing: Optional[bool] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        transformer_kwargs: Optional[Dict[str, Any]] = None,
        masked_language_modeling: bool = True,
        load_directory: Optional[str] = None
    ) -> None:
        TokenEmbedder.__init__(self)  # Call the base class constructor
        tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs)
        self.masked_language_modeling = masked_language_modeling

        if self.masked_language_modeling:
            self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
            # We only need access to the HF tokenizer if we are masked language modeling
            self.tokenizer = tokenizer.tokenizer
            # The only differences when masked language modeling are:
            # 1) `output_hidden_states` must be True to get access to token embeddings.
            # 2) We need to use `AutoModelForMaskedLM` to get the correct model
            # self.transformer_model = RobertaForAugment.from_pretrained()
            self.transformer_model = AutoModelForMaskedLM.from_pretrained(
                model_name, config=self.config, **(transformer_kwargs or {})
            )

            if load_directory is not None:
                print("Loading Model from:", load_directory)
                state = torch.load(load_directory)
                model_dict = self.transformer_model.state_dict()
                # ckpt__dict = state['state_dict']
                state = {k: v for k, v in state.items() if k in model_dict}
                model_dict.update(state) 
                self.transformer_model.load_state_dict(model_dict, strict=False)
                print("Loading Model from:", load_directory, "...Finished.")
        # Everything after the if statement (including the else) is copied directly from:
        # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
        else:
            from allennlp.common import cached_transformers

            self.transformer_model = cached_transformers.get(
                model_name, True, override_weights_file, override_weights_strip_prefix
            )
            self.config = self.transformer_model.config

        if gradient_checkpointing is not None:
            self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

        if sub_module:
            assert hasattr(self.transformer_model, sub_module)
            self.transformer_model = getattr(self.transformer_model, sub_module)

        # print("max_length", max_length)
        self._max_length = max_length

        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.config.hidden_size

        self._scalar_mix: Optional[ScalarMix] = None
        if not last_layer_only:
            self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
            self.config.output_hidden_states = True

        self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

        self.encoder = BertEncoder(self.config)
        self.layer = torch.nn.ModuleList([BertLayer(self.config)
                                    for _ in range(self.config.num_hidden_layers)])
        self.embeddings = BertEmbeddings(self.config)
        self.output_hidden_states = self.config.output_hidden_states

        if not train_parameters:
            for param in self.transformer_model.parameters():
                param.requires_grad = False