# Module-level imports these methods rely on (AllenNLP + HuggingFace transformers).
from allennlp.data import Vocabulary
from transformers import (
    BertConfig,
    BertModel,
    BertTokenizer,
    DataCollatorForWholeWordMask,
)


def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
    """
    Copies tokens from the ``transformers`` model's vocab to the specified namespace.
    """
    if self._added_to_vocabulary:
        return

    vocab.add_transformer_vocab(self._tokenizer, self._namespace)

    self._added_to_vocabulary = True
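# Usage sketch (an assumption, not in the original source): in AllenNLP-style
# token indexers this guard is typically invoked lazily from `tokens_to_indices`,
# so the transformer vocab is copied into the `Vocabulary` exactly once, the
# first time indices are produced:
#
#     def tokens_to_indices(self, tokens, vocabulary):
#         self._add_encoding_to_vocabulary_if_needed(vocabulary)
#         indices = [self._tokenizer.convert_tokens_to_ids(t.text) for t in tokens]
#         return {"token_ids": indices}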
def __init__(
    self,
    vocab: Vocabulary,
    embedding_dim: int,
    feedforward_dim: int,
    num_layers: int,
    num_attention_heads: int,
    position_embedding_dim: int,
    tokenizer_path: str,
    position_embedding_type: str = "absolute",
    activation: str = "gelu",
    hidden_dropout: float = 0.1,
) -> None:
    super().__init__()
    # TODO:
    # - Need to apply corrections in pretrained_transformer_mismatched_embedder
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    vocab.add_transformer_vocab(tokenizer, "tokens")
    # The "tokens" namespace is padded by default; drop AllenNLP's sentinel tokens
    # so the namespace maps one-to-one onto the transformer vocab.
    del vocab._token_to_index["tokens"]["@@PADDING@@"]
    del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
    # Sanity check: the forward and reverse mappings must stay the same size.
    assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

    # Build a BERT encoder from scratch, sized to the transformer-aligned vocab.
    cfg = BertConfig(
        vocab_size=vocab.get_vocab_size("tokens"),
        hidden_size=embedding_dim,
        num_hidden_layers=num_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=feedforward_dim,
        hidden_act=activation,
        hidden_dropout_prob=hidden_dropout,
        max_position_embeddings=position_embedding_dim,
        position_embedding_type=position_embedding_type,
        use_cache=True,
    )
    self.cfg = cfg
    self._vocab = vocab
    self._namespace = "tokens"
    self.bert = BertModel(cfg)
    # Whole-word masking collator for the MLM objective (masks 15% of words).
    self.masking_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
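# Construction sketch (assumptions: the enclosing class is called `BertMLMEmbedder`
# here, and "bert-base-uncased" stands in for a real `tokenizer_path`; neither name
# appears in the original source). Illustrates typical bert-base sizing and what
# the masking collator returns:
#
#     vocab = Vocabulary()
#     embedder = BertMLMEmbedder(
#         vocab=vocab,
#         embedding_dim=768,
#         feedforward_dim=3072,
#         num_layers=12,
#         num_attention_heads=12,
#         position_embedding_dim=512,
#         tokenizer_path="bert-base-uncased",
#     )
#     # The collator takes a batch of token-id lists, masks ~15% of whole words
#     # (all wordpieces of a chosen word together), and returns tensors
#     # {"input_ids": ..., "labels": ...}, with unmasked positions in "labels"
#     # set to -100 so the MLM loss ignores them.
#     batch = embedder.masking_collator([[101, 7592, 2088, 102]])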