def test_get_missing_from_cache_local_files_only(self):
    with pytest.raises((OSError, ValueError)):
        cached_transformers.get(
            "bert-base-uncased",
            True,
            cache_dir=self.TEST_DIR,
            local_files_only=True,
        )
def test_get_missing_from_cache_local_files_only(self):
    with pytest.raises(ValueError) as execinfo:
        cached_transformers.get(
            "bert-base-uncased",
            True,
            cache_dir=self.TEST_DIR,
            local_files_only=True,
        )
    assert str(execinfo.value) == (
        "Cannot find the requested files in the cached path and "
        "outgoing traffic has been disabled. To enable model "
        "look-ups and downloads online, set 'local_files_only' "
        "to False."
    )
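# For contrast with the failure cases above: a minimal sketch of the normal (online)
# call path, assuming allennlp's cached_transformers.get(model_name, make_copy, ...)
# signature used throughout these snippets.
from allennlp.common import cached_transformers

# Download (or reuse from the local cache) the full pretrained model. make_copy=True
# returns a deep copy, so in-place modifications don't affect the shared cached instance.
model = cached_transformers.get("bert-base-uncased", True)
# With local_files_only=True and an empty cache_dir, the same call raises instead,
# which is exactly what the tests above assert.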
def __init__(
    self,
    pretrained_model: str,
    *,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    load_weights: bool = True,
    requires_grad: bool = True,
    dropout: float = 0.0,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    super().__init__()

    from allennlp.common import cached_transformers

    model = cached_transformers.get(
        pretrained_model,
        False,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
        load_weights=load_weights,
        **(transformer_kwargs or {}),
    )

    self._dropout = torch.nn.Dropout(p=dropout)

    import copy

    self.pooler = copy.deepcopy(model.pooler)
    for param in self.pooler.parameters():
        param.requires_grad = requires_grad
    self._embedding_dim = model.config.hidden_size
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
) -> None:
    super().__init__()

    from allennlp.common import cached_transformers

    self.transformer_model = cached_transformers.get(
        model_name, True, override_weights_file, override_weights_strip_prefix
    )
    self.config = self.transformer_model.config
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    tokenizer = PretrainedTransformerTokenizer(model_name)
    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False
def setup_method(self): super().setup_method() self.params_dict = { "num_hidden_layers1": 3, "num_hidden_layers2": 3, "hidden_size1": 12, "hidden_size2": 12, "combined_hidden_size": 12, "intermediate_size1": 3, "intermediate_size2": 3, "num_attention_heads1": 4, "num_attention_heads2": 6, "combined_num_attention_heads": 2, "attention_dropout1": 0.1, "hidden_dropout1": 0.2, "attention_dropout2": 0.1, "hidden_dropout2": 0.2, "activation": "relu", "biattention_id1": [1, 2], "biattention_id2": [1, 2], "fixed_layer1": 1, "fixed_layer2": 1, } params = Params(copy.deepcopy(self.params_dict)) self.bimodal_encoder = BiModalEncoder.from_params(params) self.pretrained = cached_transformers.get("bert-base-uncased", False)
def test_layer_from_pretrained(pretrained_name, relevant_top_level_module):
    torch.manual_seed(1234)
    pretrained = cached_transformers.get(pretrained_name, False).eval()

    if "distilbert" in pretrained_name:
        encoder = pretrained.transformer
    else:
        encoder = pretrained.encoder
    # Hacky way to get a bert layer.
    pretrained_module = list(encoder.layer.modules())[1]

    torch.manual_seed(1234)
    module = TransformerLayer.from_pretrained_module(
        pretrained_name,
        relevant_module=None
        if relevant_top_level_module is None
        else f"{relevant_top_level_module}.encoder.layer.0",
    ).eval()

    batch_size = 2
    seq_length = 15
    hidden_size = module.attention.self.query.in_features

    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(1234)
    output = module(hidden_states, attention_mask=attention_mask.squeeze()).hidden_states
    torch.manual_seed(1234)
    hf_output = pretrained_module(hidden_states, attention_mask=attention_mask_hf)[0]

    assert torch.allclose(output, hf_output, atol=1e-04)
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    masked_language_modeling: bool = True,
) -> None:
    TokenEmbedder.__init__(self)  # Call the base class constructor

    tokenizer = PretrainedTransformerTokenizer(model_name)
    self.masked_language_modeling = masked_language_modeling

    if self.masked_language_modeling:
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # We only need access to the HF tokenizer if we are masked language modeling
        self.tokenizer = tokenizer.tokenizer
        # The only differences when masked language modeling are:
        # 1) `output_hidden_states` must be True to get access to token embeddings.
        # 2) We need to use `AutoModelForMaskedLM` to get the correct model
        self.transformer_model = AutoModelForMaskedLM.from_pretrained(model_name, config=self.config)
    # Everything after the if statement (including the else) is copied directly from:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
    else:
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file, override_weights_strip_prefix
        )
        self.config = self.transformer_model.config

    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False
def test_loading_from_pretrained_weights_using_model_name( self, pretrained_name): torch.manual_seed(1234) pretrained = cached_transformers.get(pretrained_name, False) if "distilbert" in pretrained_name: encoder = pretrained.transformer else: encoder = pretrained.encoder # Hacky way to get a bert layer. for i, pretrained_module in enumerate(encoder.layer.modules()): if i == 1: break # Get the self attention layer. if "distilbert" in pretrained_name: pretrained_module = pretrained_module.attention else: pretrained_module = pretrained_module.attention.self torch.manual_seed(1234) module = SelfAttention.from_pretrained_module(pretrained_name) mapping = { val: key for key, val in module._construct_default_mapping( pretrained_module, "huggingface", {}).items() } assert_equal_parameters(pretrained_module, module, mapping=mapping) batch_size = 2 seq_len = 3 dim = module.query.in_features hidden_states = torch.randn(batch_size, seq_len, dim) attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len)) # setting to eval mode to avoid non-deterministic dropout. module = module.eval() pretrained_module = pretrained_module.eval() torch.manual_seed(1234) output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] if "distilbert" in pretrained_name: torch.manual_seed(1234) hf_output = pretrained_module.forward(hidden_states, hidden_states, hidden_states, mask=attention_mask)[0] else: # The attn_mask is processed outside the self attention module in HF bert models. attention_mask = (~(attention_mask == 1)) * -10e5 torch.manual_seed(1234) hf_output = pretrained_module.forward( hidden_states, attention_mask=attention_mask)[0] assert torch.allclose(output, hf_output)
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    super().__init__()

    from allennlp.common import cached_transformers

    self.transformer_model = cached_transformers.get(
        model_name,
        True,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
        **(transformer_kwargs or {}),
    )

    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

    self.config = self.transformer_model.config
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    tokenizer = PretrainedTransformerTokenizer(
        model_name,
        tokenizer_kwargs=tokenizer_kwargs,
    )
    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False
def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):
    pretrained_module = cached_transformers.get(pretrained_name, False).embeddings
    module = TransformerEmbeddings.from_pretrained_module(pretrained_name)
    mapping = {
        val: key
        for key, val in module._construct_default_mapping(
            pretrained_module, "huggingface", {}
        ).items()
    }
    missing = assert_equal_parameters(pretrained_module, module, mapping=mapping)
    assert len(missing) == 0
def get_relevant_module(
    cls,
    pretrained_module: Union[str, torch.nn.Module],
    relevant_module: Optional[Union[str, List[str]]] = None,
    source: str = "huggingface",
    mapping: Optional[Dict[str, str]] = None,
    load_weights: bool = True,
):
    """
    Returns the relevant underlying module given a model name/object.

    # Parameters

    pretrained_module : `Union[str, torch.nn.Module]`
        Name of the transformer model containing the layer, or the actual layer
        (not the model object).
    relevant_module : `Optional[Union[str, List[str]]]`, optional
        Name of the desired module. Defaults to `cls._relevant_module`.
    source : `str`, optional
        Where the model came from. Default - huggingface.
    mapping : `Dict[str, str]`, optional
        Optional mapping that determines any differences in the module names
        between the class modules and the input model's modules.
        Default - `cls._huggingface_mapping`
    load_weights : `bool`, optional
        Whether or not to load the pretrained weights. Default is `True`.
    """
    if isinstance(pretrained_module, str):
        pretrained_module = cached_transformers.get(
            pretrained_module, False, load_weights=load_weights
        )

    relevant_module = relevant_module or cls._relevant_module

    if relevant_module is not None:
        submodules = cls._get_mapped_submodules(pretrained_module, source, mapping)
        # If the relevant_module is not found, we assume that the pretrained_module
        # is already the relevant module.
        if isinstance(relevant_module, str):
            relevant_module = [relevant_module]
        found = False
        for module in relevant_module:
            if module in submodules:
                pretrained_module = submodules[module]
                found = True
                break
        if not found:
            logger.warning(
                "{} was not found! The submodules are: {}".format(
                    relevant_module, submodules.keys()
                )
            )
    return pretrained_module
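# A minimal usage sketch for get_relevant_module above, assuming it is exposed as a
# classmethod on a TransformerModule subclass such as TransformerStack (the class name
# and the "encoder" submodule name are assumptions based on the other snippets here).
from allennlp.modules.transformer import TransformerStack

# Pull just the encoder submodule out of a pretrained BERT model. When relevant_module
# is None the method falls back to cls._relevant_module, and it logs a warning (and
# returns the whole model) if the requested submodule cannot be found.
encoder = TransformerStack.get_relevant_module(
    "bert-base-uncased",
    relevant_module="encoder",
    load_weights=True,
)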
def test_use_selected_layers_of_bert_for_different_purposes(self): class MediumTransformer(torch.nn.Module): def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-uncased") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(0, 8)) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-uncased", num_hidden_layers=range(8, 12), ) @overrides def forward( self, left_token_ids: torch.LongTensor, right_token_ids: torch.LongTensor, ): left = self.embeddings(left_token_ids) left = self.separate_transformer(left) right = self.embeddings(right_token_ids) right = self.separate_transformer(right) # combine the sequences in some meaningful way. here, we just add them. # combined = combine_masked_sequences(left, left_mask, right, right_mask) combined = left + right return self.combined_transformer(combined) medium = MediumTransformer() assert (len(medium.separate_transformer.layers)) == 8 assert (len(medium.combined_transformer.layers)) == 4 pretrained = cached_transformers.get("bert-base-uncased", False) pretrained_layers = dict(pretrained.encoder.layer.named_modules()) medium_layers = dict( medium.combined_transformer.layers.named_modules()) assert_equal_parameters(medium_layers["0"], pretrained_layers["8"], TransformerStack._huggingface_mapping) assert_equal_parameters(medium_layers["1"], pretrained_layers["9"], TransformerStack._huggingface_mapping) assert_equal_parameters(medium_layers["2"], pretrained_layers["10"], TransformerStack._huggingface_mapping) assert_equal_parameters(medium_layers["3"], pretrained_layers["11"], TransformerStack._huggingface_mapping)
def test_from_pretrained_no_load_weights_local_config(self):
    config = AutoConfig.from_pretrained("epwalsh/bert-xsmall-dummy", cache_dir=self.TEST_DIR)
    self.clear_test_dir()

    # Save config to file.
    local_config_path = str(self.TEST_DIR / "local_config.json")
    config.to_json_file(local_config_path, use_diff=False)

    # Now load the model from the local config.
    _ = cached_transformers.get(
        local_config_path, False, load_weights=False, cache_dir=self.TEST_DIR
    )

    # Make sure no other files were downloaded.
    assert os.listdir(str(self.TEST_DIR)) == ["local_config.json"]
def test_loading_from_pretrained_weights_using_model_name( self, pretrained_name): torch.manual_seed(1234) pretrained = cached_transformers.get(pretrained_name, False) if "distilbert" in pretrained_name: encoder = pretrained.transformer else: encoder = pretrained.encoder # Hacky way to get a bert layer. for i, pretrained_module in enumerate(encoder.layer.modules()): if i == 1: break pretrained_module = pretrained_module torch.manual_seed(1234) module = TransformerLayer.from_pretrained_module(pretrained_name) mapping = { val: key for key, val in module._construct_default_mapping( pretrained_module, "huggingface", {}).items() } assert_equal_parameters(pretrained_module, module, mapping=mapping) batch_size = 2 seq_len = 768 dim = module.attention.self.query.in_features hidden_states = torch.randn(batch_size, seq_len, dim) attention_mask = torch.randint(0, 2, (batch_size, seq_len)) mask_reshp = (batch_size, 1, 1, dim) attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand( batch_size, 12, seq_len, seq_len) * -10e5 # setting to eval mode to avoid non-deterministic dropout. module = module.eval() pretrained_module = pretrained_module.eval() torch.manual_seed(1234) output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] torch.manual_seed(1234) hf_output = pretrained_module.forward( hidden_states, attention_mask=attention_mask_hf)[0] assert torch.allclose(output, hf_output, atol=1e-04)
def test_from_pretrained_avoids_weights_download_if_override_weights(self):
    # Only download the config, because downloading the pretrained weights as well takes too long.
    config = AutoConfig.from_pretrained("epwalsh/bert-xsmall-dummy", cache_dir=self.TEST_DIR)
    transformer = AutoModel.from_config(config)

    # Clear the cache directory.
    self.clear_test_dir()

    save_weights_path = str(self.TEST_DIR / "bert_weights.pth")
    torch.save(transformer.state_dict(), save_weights_path)

    override_transformer = cached_transformers.get(
        "epwalsh/bert-xsmall-dummy",
        False,
        override_weights_file=save_weights_path,
        cache_dir=self.TEST_DIR,
    )

    # Check that only three files were downloaded (filename.json, filename, filename.lock) for config.json.
    # If more than three files were downloaded, then model weights were also (incorrectly) downloaded.
    # NOTE: downloaded files are not explicitly detailed in Huggingface's public API,
    # so this assertion could fail in the future.
    json_fnames = [fname for fname in os.listdir(str(self.TEST_DIR)) if fname.endswith(".json")]
    assert len(json_fnames) == 1
    json_data = json.load(open(str(self.TEST_DIR / json_fnames[0])))
    assert (
        json_data["url"]
        == "https://huggingface.co/epwalsh/bert-xsmall-dummy/resolve/main/config.json"
    )
    resource_id = os.path.splitext(json_fnames[0])[0]
    assert set(os.listdir(str(self.TEST_DIR))) == set(
        [json_fnames[0], resource_id, resource_id + ".lock", "bert_weights.pth"]
    )

    # Check that the override weights were loaded correctly.
    for p1, p2 in zip(transformer.parameters(), override_transformer.parameters()):
        assert p1.data.ne(p2.data).sum() == 0
def setup_method(self): super().setup_method() self.params_dict = { "hidden_size": 6, "intermediate_size": 3, "num_attention_heads": 2, "attention_dropout": 0.1, "hidden_dropout": 0.2, "activation": "relu", } params = Params(copy.deepcopy(self.params_dict)) self.transformer_layer = TransformerLayer.from_params(params) self.pretrained_name = "bert-base-uncased" self.pretrained = cached_transformers.get(self.pretrained_name, False)
def test_use_first_four_layers_of_pretrained(self):
    pretrained = cached_transformers.get("bert-base-uncased", False)

    class SmallTransformer(TokenEmbedder):
        def __init__(self):
            super().__init__()
            self.embeddings = TransformerEmbeddings.from_pretrained_module(pretrained)
            self.transformer = TransformerStack.from_pretrained_module(
                pretrained, num_hidden_layers=4
            )

        @overrides
        def forward(self, token_ids: torch.LongTensor):
            x = self.embeddings(token_ids)
            x = self.transformer(x)
            return x

    small = SmallTransformer()
    assert len(small.transformer.layers) == 4
    small.forward(torch.LongTensor([[0, 1, 2]]))
def test_from_pretrained_no_load_weights(self):
    _ = cached_transformers.get(
        "epwalsh/bert-xsmall-dummy", False, load_weights=False, cache_dir=self.TEST_DIR
    )
    # Check that only three files were downloaded (filename.json, filename, filename.lock) for config.json.
    # If more than three files were downloaded, then model weights were also (incorrectly) downloaded.
    # NOTE: downloaded files are not explicitly detailed in Huggingface's public API,
    # so this assertion could fail in the future.
    json_fnames = [fname for fname in os.listdir(str(self.TEST_DIR)) if fname.endswith(".json")]
    assert len(json_fnames) == 1
    json_data = json.load(open(str(self.TEST_DIR / json_fnames[0])))
    assert (
        json_data["url"]
        == "https://huggingface.co/epwalsh/bert-xsmall-dummy/resolve/main/config.json"
    )
    resource_id = os.path.splitext(json_fnames[0])[0]
    assert set(os.listdir(str(self.TEST_DIR))) == set(
        [json_fnames[0], resource_id, resource_id + ".lock"]
    )
def test_loading_from_pretrained(pretrained_model_name):
    transformer_stack = TransformerStack.from_pretrained_module(pretrained_model_name).eval()
    pretrained_module = cached_transformers.get(pretrained_model_name, True).encoder.eval()

    batch_size = 2
    seq_length = 15
    hidden_size = transformer_stack.layers[0]._hidden_size
    hidden_states = torch.randn(batch_size, seq_length, hidden_size)
    attention_mask = torch.randint(0, 2, (batch_size, seq_length))
    attention_mask_hf = attention_mask[:, None, None, :]
    attention_mask_hf = (1.0 - attention_mask_hf) * -10e5

    torch.manual_seed(SEED)
    output = transformer_stack(hidden_states, attention_mask=attention_mask)
    torch.manual_seed(SEED)
    hf_output = pretrained_module(hidden_states, attention_mask=attention_mask_hf)

    assert torch.allclose(output.final_hidden_states, hf_output[0])
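# Several of the tests here repeat the same conversion from a 0/1 padding mask to the
# additive mask that the HuggingFace encoder expects when called directly. A small
# helper capturing that pattern (the function name is illustrative, not from the
# original code; -10e5 mirrors the constant used in the tests above).
import torch


def to_hf_additive_mask(attention_mask: torch.Tensor, value: float = -10e5) -> torch.Tensor:
    # Expand (batch, seq_len) to (batch, 1, 1, seq_len) and map 1 -> 0, 0 -> value,
    # so padded positions receive a large negative bias before the softmax.
    expanded = attention_mask[:, None, None, :].to(torch.float)
    return (1.0 - expanded) * value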
def test_end_to_end(self, model_name: str):
    data = [
        ("I'm against picketing", "but I don't know how to show it."),
        ("I saw a human pyramid once.", "It was very unnecessary."),
    ]
    tokenizer = cached_transformers.get_tokenizer(model_name)
    batch = tokenizer.batch_encode_plus(data, padding=True, return_tensors="pt")

    with torch.no_grad():
        huggingface_model = cached_transformers.get(model_name, make_copy=False).eval()
        huggingface_output = huggingface_model(**batch)

        embeddings = TransformerEmbeddings.from_pretrained_module(model_name).eval()
        transformer_stack = TransformerStack.from_pretrained_module(model_name).eval()
        pooler = TransformerPooler.from_pretrained_module(model_name).eval()

        batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
        output = embeddings(**batch)
        output = transformer_stack(output, batch["attention_mask"])

        assert_allclose(
            output.final_hidden_states,
            huggingface_output.last_hidden_state,
            rtol=0.0001,
            atol=1e-4,
        )

        output = pooler(output.final_hidden_states)
        assert_allclose(output, huggingface_output.pooler_output, rtol=0.0001, atol=1e-4)
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    eval_mode: bool = False,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    super().__init__()

    from allennlp.common import cached_transformers

    self.transformer_model = cached_transformers.get(
        model_name,
        True,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
        **(transformer_kwargs or {}),
    )

    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})

    self.config = self.transformer_model.config
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    tokenizer = PretrainedTransformerTokenizer(
        model_name,
        tokenizer_kwargs=tokenizer_kwargs,
    )

    try:
        if self.transformer_model.get_input_embeddings().num_embeddings != len(
            tokenizer.tokenizer
        ):
            self.transformer_model.resize_token_embeddings(len(tokenizer.tokenizer))
    except NotImplementedError:
        # Can't resize for transformers models that don't implement base_model.get_input_embeddings()
        logger.warning(
            "Could not resize the token embedding matrix of the transformer model. "
            "This model does not support resizing."
        )

    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    self.train_parameters = train_parameters
    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False

    self.eval_mode = eval_mode
    if eval_mode:
        self.transformer_model.eval()
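# A minimal instantiation sketch for the constructor above, assuming it belongs to
# allennlp's PretrainedTransformerEmbedder (the class name is an assumption based on
# the parameter list; the keyword arguments shown are the ones defined above).
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

embedder = PretrainedTransformerEmbedder(
    "bert-base-uncased",
    max_length=512,
    last_layer_only=False,   # enables the ScalarMix over all hidden layers
    train_parameters=False,  # freezes the transformer weights
    eval_mode=True,          # keeps the transformer in eval mode (e.g. deterministic dropout)
)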
def test_use_selected_layers_of_bert_for_different_purposes(self): class MediumTransformer(torch.nn.Module): def __init__(self): super().__init__() self.embeddings = TransformerEmbeddings.from_pretrained_module( "bert-base-cased", relevant_module="bert.embeddings") self.separate_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=8, strict=False, ) self.combined_transformer = TransformerStack.from_pretrained_module( "bert-base-cased", relevant_module="bert.encoder", num_hidden_layers=4, mapping={ f"layer.{l}": f"layers.{i}" for (i, l) in enumerate(range(8, 12)) }, strict=False, ) @overrides def forward( self, left_token_ids: torch.LongTensor, right_token_ids: torch.LongTensor, ): left = self.embeddings(left_token_ids) left = self.separate_transformer(left) right = self.embeddings(right_token_ids) right = self.separate_transformer(right) # combine the sequences in some meaningful way. here, we just add them. # combined = combine_masked_sequences(left, left_mask, right, right_mask) combined = left + right return self.combined_transformer(combined) medium = MediumTransformer() assert (len(medium.separate_transformer.layers)) == 8 assert (len(medium.combined_transformer.layers)) == 4 pretrained = cached_transformers.get("bert-base-cased", False) pretrained_layers = dict(pretrained.encoder.layer.named_modules()) separate_layers = dict( medium.separate_transformer.layers.named_modules()) assert_allclose( separate_layers["0"].intermediate.dense.weight.data, pretrained_layers["0"].intermediate.dense.weight.data, ) combined_layers = dict( medium.combined_transformer.layers.named_modules()) assert_allclose( combined_layers["0"].intermediate.dense.weight.data, pretrained_layers["8"].intermediate.dense.weight.data, ) assert_allclose( combined_layers["1"].intermediate.dense.weight.data, pretrained_layers["9"].intermediate.dense.weight.data, ) assert_allclose( combined_layers["2"].intermediate.dense.weight.data, pretrained_layers["10"].intermediate.dense.weight.data, ) assert_allclose( combined_layers["3"].intermediate.dense.weight.data, pretrained_layers["11"].intermediate.dense.weight.data, )
def __init__(
    self,
    model_name: str,
    *,
    max_length: int = None,
    sub_module: str = None,
    train_parameters: bool = True,
    last_layer_only: bool = True,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    gradient_checkpointing: Optional[bool] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    transformer_kwargs: Optional[Dict[str, Any]] = None,
    masked_language_modeling: bool = True,
    load_directory: Optional[str] = None,
) -> None:
    TokenEmbedder.__init__(self)  # Call the base class constructor

    tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs)
    self.masked_language_modeling = masked_language_modeling

    if self.masked_language_modeling:
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # We only need access to the HF tokenizer if we are masked language modeling
        self.tokenizer = tokenizer.tokenizer
        # The only differences when masked language modeling are:
        # 1) `output_hidden_states` must be True to get access to token embeddings.
        # 2) We need to use `AutoModelForMaskedLM` to get the correct model
        # self.transformer_model = RobertaForAugment.from_pretrained()
        self.transformer_model = AutoModelForMaskedLM.from_pretrained(
            model_name, config=self.config, **(transformer_kwargs or {})
        )
        if load_directory is not None:
            print("Loading Model from:", load_directory)
            state = torch.load(load_directory)
            model_dict = self.transformer_model.state_dict()
            # ckpt__dict = state['state_dict']
            state = {k: v for k, v in state.items() if k in model_dict}
            model_dict.update(state)
            self.transformer_model.load_state_dict(model_dict, strict=False)
            print("Loading Model from:", load_directory, "...Finished.")
    # Everything after the if statement (including the else) is copied directly from:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py
    else:
        from allennlp.common import cached_transformers

        self.transformer_model = cached_transformers.get(
            model_name, True, override_weights_file, override_weights_strip_prefix
        )
        self.config = self.transformer_model.config

    if gradient_checkpointing is not None:
        self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing})
    if sub_module:
        assert hasattr(self.transformer_model, sub_module)
        self.transformer_model = getattr(self.transformer_model, sub_module)
    # print("max_length", max_length)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.config.hidden_size

    self._scalar_mix: Optional[ScalarMix] = None
    if not last_layer_only:
        self._scalar_mix = ScalarMix(self.config.num_hidden_layers)
        self.config.output_hidden_states = True

    self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens

    self.encoder = BertEncoder(self.config)
    self.layer = torch.nn.ModuleList(
        [BertLayer(self.config) for _ in range(self.config.num_hidden_layers)]
    )
    self.embeddings = BertEmbeddings(self.config)
    self.output_hidden_states = self.config.output_hidden_states

    if not train_parameters:
        for param in self.transformer_model.parameters():
            param.requires_grad = False