class RoBERTaEncoder(RoBERTaEncoderBase):
    """A PyTorch RoBERTa implementation"""

    class Config(RoBERTaEncoderBase.Config):
        # Transformer depth/width; defaults correspond to roberta-base
        # (12 layers, 12 attention heads). embedding_dim is presumably
        # inherited from the base Config — confirm.
        num_encoder_layers: int = 12
        num_attention_heads: int = 12
        # Pretrained checkpoint to load at construction time.
        model_path: str = "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"

    def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
        """Build the transformer stack and load the pretrained checkpoint.

        Args:
            config: encoder hyperparameters plus the checkpoint location.
            output_encoded_layers: forwarded to the base encoder.
        """
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                )
                for _ in range(config.num_encoder_layers)
            ],
        ))
        # NOTE(review): torch.load is given the path string directly; other
        # versions of this constructor in the file open it via
        # PathManager.open first, which a manifold:// URI likely requires —
        # confirm this path is readable by torch.load as-is.
        roberta_state = torch.load(
            config.model_path,
            map_location=lambda s, l: default_restore_location(s, "cpu"),
        )
        # Remap the pretrained state dict into this module's layout.
        self.encoder.load_roberta_state_dict(roberta_state["model"])
        # Output dimension equals the token embedding width.
        self.representation_dim = self.encoder.transformer.token_embedding.weight.size(
            -1)
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    """Construct the RoBERTa encoder and optionally load pretrained weights.

    Args:
        config: encoder hyperparameters plus an optional checkpoint path.
        output_encoded_layers: forwarded to the base encoder.
    """
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    encoder_layers = []
    for _ in range(config.num_encoder_layers):
        self_attention = MultiheadSelfAttention(
            config.embedding_dim, config.num_attention_heads
        )
        encoder_layers.append(
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=self_attention,
            )
        )
    transformer = Transformer(
        vocab_size=config.vocab_size,
        embedding_dim=config.embedding_dim,
        layers=encoder_layers,
    )
    self.encoder = SentenceEncoder(transformer=transformer)
    self.apply(init_params)
    if config.model_path:
        with PathManager.open(config.model_path, "rb") as stream:
            state = torch.load(
                stream,
                map_location=lambda s, l: default_restore_location(s, "cpu"),
            )
        # A checkpoint previously finetuned in PyText already matches this
        # module's state-dict layout and loads directly; a raw pretrained
        # checkpoint needs the special translation step.
        if config.is_finetuned:
            self.load_state_dict(state)
        else:
            self.encoder.load_roberta_state_dict(state["model"])
    self.representation_dim = self._embedding().weight.size(-1)
    log_class_usage(__class__)
class RoBERTaEncoder(RoBERTaEncoderBase):
    """A PyTorch RoBERTa implementation"""

    class Config(RoBERTaEncoderBase.Config):
        # Architecture hyperparameters; defaults match roberta-base
        # (768-dim, 50265-token vocab, 12 layers, 12 heads).
        embedding_dim: int = 768
        vocab_size: int = 50265
        num_encoder_layers: int = 12
        num_attention_heads: int = 12
        # Pretrained checkpoint; may be a symbolic key remapped through
        # resources.roberta.RESOURCE_MAP in __init__.
        model_path: str = (
            "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"
        )
        # Loading the state dict of the model depends on whether the model was
        # previously finetuned in PyText or not. If it was finetuned then we
        # don't need to translate the state dict and can just load it
        # directly.
        is_finetuned: bool = False

    def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
        """Build the transformer stack and optionally load pretrained weights."""
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # map to the real model_path
        config.model_path = (
            resources.roberta.RESOURCE_MAP[config.model_path]
            if config.model_path in resources.roberta.RESOURCE_MAP
            else config.model_path
        )
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                )
                for _ in range(config.num_encoder_layers)
            ],
        ))
        # Random init first; overwritten below when a checkpoint is given.
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f,
                    map_location=lambda s, l: default_restore_location(s, "cpu"),
                )
            # In case the model has previously been loaded in PyText and
            # finetuned, then we don't need to do the special state dict
            # translation. Load it directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)
        # Output dimension equals the token embedding width.
        self.representation_dim = self._embedding().weight.size(-1)
        log_class_usage(__class__)

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.encoder.transformer.token_embedding
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None: super().__init__(config, output_encoded_layers=output_encoded_layers) # map to the real model_path config.model_path = (resources.roberta.RESOURCE_MAP[config.model_path] if config.model_path in resources.roberta.RESOURCE_MAP else config.model_path) # assert config.pretrained_encoder.load_path, "Load path cannot be empty." # sharing compression across each layers # create compress layer if use linear multihead attention if config.use_linformer_encoder: compress_layer = nn.Linear( config.max_seq_len - 2, (config.max_seq_len - 2) // config.linformer_compressed_ratio, ) self.encoder = SentenceEncoder(transformer=Transformer( vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, layers=[ TransformerLayer( embedding_dim=config.embedding_dim, attention=MultiheadLinearAttention( embed_dim=config.embedding_dim, num_heads=config.num_attention_heads, compress_layer=compress_layer, ) if config. use_linformer_encoder else MultiheadSelfAttention( embed_dim=config.embedding_dim, num_heads=config.num_attention_heads, ), ) for _ in range(config.num_encoder_layers) ], max_seq_len=config.max_seq_len, )) self.apply(init_params) if config.model_path: with PathManager.open(config.model_path, "rb") as f: roberta_state = torch.load(f, map_location=lambda s, l: default_restore_location(s, "cpu")) # In case the model has previously been loaded in PyText and finetuned, # then we dont need to do the special state dict translation. Load # it directly if not config.is_finetuned: self.encoder.load_roberta_state_dict(roberta_state["model"]) else: self.load_state_dict(roberta_state) self.representation_dim = self._embedding().weight.size(-1) self.export_encoder = config.export_encoder self.variable_size_embedding = config.variable_size_embedding log_class_usage(__class__)
def _small_encoder(self):
    """Build a tiny two-layer SentenceEncoder (dim 12) for testing."""
    dim = 12
    stack = []
    for _ in range(2):
        stack.append(
            TransformerLayer(
                embedding_dim=dim,
                attention=MultiheadSelfAttention(
                    embed_dim=dim, num_heads=12, scaling=0.125
                ),
            )
        )
    return SentenceEncoder(
        Transformer(vocab_size=100, embedding_dim=dim, layers=stack)
    )
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    """Build the transformer stack and load the pretrained checkpoint.

    Args:
        config: encoder hyperparameters plus the checkpoint location.
        output_encoded_layers: forwarded to the base encoder.
    """
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    self.encoder = SentenceEncoder(transformer=Transformer(
        embedding_dim=config.embedding_dim,
        layers=[
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=MultiheadSelfAttention(
                    config.embedding_dim, config.num_attention_heads),
            )
            for _ in range(config.num_encoder_layers)
        ],
    ))
    # Fix: open the checkpoint through PathManager, as every other loader in
    # this file does — torch.load on the bare path string only understands
    # local filesystem paths, not manifold:// URIs.
    with PathManager.open(config.model_path, "rb") as f:
        roberta_state = torch.load(
            f,
            map_location=lambda s, l: default_restore_location(s, "cpu"),
        )
    # Remap the pretrained state dict into this module's layout.
    self.encoder.load_roberta_state_dict(roberta_state["model"])
    # Output dimension equals the token embedding width.
    self.representation_dim = self.encoder.transformer.token_embedding.weight.size(-1)
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None: super().__init__(config, output_encoded_layers=output_encoded_layers) # map to the real model_path config.model_path = (resources.roberta.RESOURCE_MAP[config.model_path] if config.model_path in resources.roberta.RESOURCE_MAP else config.model_path) # assert config.pretrained_encoder.load_path, "Load path cannot be empty." # sharing compression across each layers # create compress layer if use linear multihead attention if config.use_linformer_encoder: compress_layer = nn.Linear( config.max_seq_len - 2, (config.max_seq_len - 2) // config.linformer_compressed_ratio, ) self.use_selfie_encoder = config.use_selfie_encoder if config.use_linformer_encoder: if config.linformer_quantize: layers = [ TransformerLayer( embedding_dim=config.embedding_dim, attention=QuantizedMultiheadLinearAttention( embed_dim=config.embedding_dim, num_heads=config.num_attention_heads, compress_layer=compress_layer, ), ) for _ in range(config.num_encoder_layers) ] else: layers = [ TransformerLayer( embedding_dim=config.embedding_dim, attention=MultiheadLinearAttention( embed_dim=config.embedding_dim, num_heads=config.num_attention_heads, compress_layer=compress_layer, ), ) for _ in range(config.num_encoder_layers) ] else: layers = [ TransformerLayer( embedding_dim=config.embedding_dim, attention=MultiheadSelfAttention( embed_dim=config.embedding_dim, num_heads=config.num_attention_heads, ), ) for _ in range(config.num_encoder_layers) ] self.encoder = (SentenceEncoder(transformer=Transformer( vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, layers=layers, max_seq_len=config.max_seq_len, )) if not self.use_selfie_encoder else PostEncoder( transformer=SELFIETransformer( vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, layers=layers, max_seq_len=config.max_seq_len, ))) self.apply(init_params) if config.model_path: with PathManager.open(config.model_path, "rb") as f: roberta_state = 
torch.load(f, map_location=lambda s, l: default_restore_location(s, "cpu")) # In case the model has previously been loaded in PyText and finetuned, # then we dont need to do the special state dict translation. Load # it directly if not config.is_finetuned: self.encoder.load_roberta_state_dict(roberta_state["model"]) else: self.load_state_dict(roberta_state) if config.use_bias_finetuning: for (n, p) in self.encoder.named_parameters(): # "encoder.transformer.layers.0.attention.input_projection.weight" -> false # "encoder.transformer.layers.0.attention.input_projection.bias" -> true if n.split(".")[-1] != "bias": p.requires_grad_(False) self.export_encoder = config.export_encoder self.variable_size_embedding = config.variable_size_embedding self.use_linformer_encoder = config.use_linformer_encoder log_class_usage(__class__)
def _small_encoder(self):
    """Return a minimal two-layer SentenceEncoder (dim 12) for testing."""
    tiny_layers = [
        TransformerLayer(embedding_dim=12),
        TransformerLayer(embedding_dim=12),
    ]
    tiny_transformer = Transformer(
        vocab_size=100, embedding_dim=12, layers=tiny_layers
    )
    return SentenceEncoder(tiny_transformer)