Example #1
class RoBERTaEncoder(RoBERTaEncoderBase):
    """A PyTorch RoBERTa implementation"""
    class Config(RoBERTaEncoderBase.Config):
        num_encoder_layers: int = 12
        num_attention_heads: int = 12
        model_path: str = "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"

    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                ) for _ in range(config.num_encoder_layers)
            ],
        ))
        roberta_state = torch.load(
            config.model_path,
            map_location=lambda s, l: default_restore_location(s, "cpu"),
        )
        self.encoder.load_roberta_state_dict(roberta_state["model"])
        self.representation_dim = self.encoder.transformer.token_embedding.weight.size(
            -1)
Example #2
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                ) for _ in range(config.num_encoder_layers)
            ],
        ))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need to do the special state-dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        self.representation_dim = self._embedding().weight.size(-1)
        log_class_usage(__class__)
Example #3
class RoBERTaEncoder(RoBERTaEncoderBase):
    """A PyTorch RoBERTa implementation"""
    class Config(RoBERTaEncoderBase.Config):
        embedding_dim: int = 768
        vocab_size: int = 50265
        num_encoder_layers: int = 12
        num_attention_heads: int = 12
        model_path: str = (
            "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"
        )
        # Loading the state dict of the model depends on whether the model was
        # previously fine-tuned in PyText or not. If it was fine-tuned, we
        # don't need to translate the state dict and can just load it
        # directly.
        is_finetuned: bool = False

    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (
            resources.roberta.RESOURCE_MAP[config.model_path]
            if config.model_path in resources.roberta.RESOURCE_MAP
            else config.model_path
        )
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                ) for _ in range(config.num_encoder_layers)
            ],
        ))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need to do the special state-dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        self.representation_dim = self._embedding().weight.size(-1)
        log_class_usage(__class__)

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.encoder.transformer.token_embedding
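
A minimal usage sketch, not part of the original listing: it assumes the RoBERTaEncoder class from Example #3 is importable (in PyText it lives in pytext.models.roberta) and that Config fields can be overridden at construction. The small values are illustrative, and the empty model_path simply skips the checkpoint-loading branch shown above.

# Hypothetical usage sketch for the class shown in Example #3.
# The tiny config values below are illustrative, not PyText defaults.
config = RoBERTaEncoder.Config(
    embedding_dim=32,        # must be divisible by num_attention_heads
    vocab_size=100,
    num_encoder_layers=2,
    num_attention_heads=2,
    model_path="",           # empty path -> checkpoint loading is skipped
)
encoder = RoBERTaEncoder(config, output_encoded_layers=False)
assert encoder.representation_dim == 32  # size(-1) of the token embedding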
Example #4
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (
            resources.roberta.RESOURCE_MAP[config.model_path]
            if config.model_path in resources.roberta.RESOURCE_MAP
            else config.model_path
        )
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        # share the compression layer across all encoder layers

        # create the compression layer if using linear multihead attention
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.encoder = SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ) if config.use_linformer_encoder
                    else MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                    ),
                ) for _ in range(config.num_encoder_layers)
            ],
            max_seq_len=config.max_seq_len,
        ))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need to do the special state-dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        self.representation_dim = self._embedding().weight.size(-1)
        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        log_class_usage(__class__)
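
For context, a small sketch of the shared Linformer compression layer built above. The max_seq_len and linformer_compressed_ratio values are assumed for illustration only; in the example they come from the Config.

# Illustrative values only: with max_seq_len=514 and a compression ratio of 4,
# the sequence dimension (minus the two special positions) is projected 512 -> 128.
import torch.nn as nn

max_seq_len = 514
linformer_compressed_ratio = 4
compress_layer = nn.Linear(
    max_seq_len - 2, (max_seq_len - 2) // linformer_compressed_ratio
)
print(compress_layer)  # Linear(in_features=512, out_features=128, bias=True)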
Example #5
    def _small_encoder(self):
        layers = [
            TransformerLayer(
                embedding_dim=12,
                attention=MultiheadSelfAttention(
                    embed_dim=12, num_heads=12, scaling=0.125
                ),
            )
            for _ in range(2)
        ]
        transformer = Transformer(vocab_size=100, embedding_dim=12, layers=layers)
        return SentenceEncoder(transformer)
Example #6
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = SentenceEncoder(transformer=Transformer(
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads),
                ) for _ in range(config.num_encoder_layers)
            ],
        ))
        roberta_state = torch.load(
            config.model_path,
            map_location=lambda s, l: default_restore_location(s, "cpu"),
        )
        self.encoder.load_roberta_state_dict(roberta_state["model"])
        self.representation_dim = self.encoder.transformer.token_embedding.weight.size(
            -1)
Example #7
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (
            resources.roberta.RESOURCE_MAP[config.model_path]
            if config.model_path in resources.roberta.RESOURCE_MAP
            else config.model_path
        )
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        # share the compression layer across all encoder layers

        # create the compression layer if using linear multihead attention
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.use_selfie_encoder = config.use_selfie_encoder

        if config.use_linformer_encoder:
            if config.linformer_quantize:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=QuantizedMultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
            else:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=MultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                    ),
                ) for _ in range(config.num_encoder_layers)
            ]

        self.encoder = (SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=layers,
            max_seq_len=config.max_seq_len,
        )) if not self.use_selfie_encoder else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need to do the special state-dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        if config.use_bias_finetuning:
            for (n, p) in self.encoder.named_parameters():
                # "encoder.transformer.layers.0.attention.input_projection.weight" -> false
                # "encoder.transformer.layers.0.attention.input_projection.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)

        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        self.use_linformer_encoder = config.use_linformer_encoder
        log_class_usage(__class__)
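
The use_bias_finetuning branch above freezes every parameter whose name does not end in "bias". Below is a standalone sketch of the same pattern applied to a throwaway nn.Sequential; the model is hypothetical and only illustrates the freezing loop.

import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
for name, param in model.named_parameters():
    # "0.weight" -> frozen, "0.bias" -> still trainable
    if name.split(".")[-1] != "bias":
        param.requires_grad_(False)

print([n for n, p in model.named_parameters() if p.requires_grad])
# ['0.bias', '2.bias']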
Example #8
    def _small_encoder(self):
        layers = [TransformerLayer(embedding_dim=12) for _ in range(2)]
        transformer = Transformer(
            vocab_size=100, embedding_dim=12, layers=layers
        )
        return SentenceEncoder(transformer)
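
Examples #5 and #8 are test helpers that build a deliberately tiny encoder. A self-contained variant is sketched below; the import path is an assumption based on where these building blocks are exported in PyText, so adjust it to your checkout.

# Sketch only: the import path below is assumed, not taken from the listing.
from pytext.models.representations.transformer import (
    MultiheadSelfAttention,
    SentenceEncoder,
    Transformer,
    TransformerLayer,
)


def small_encoder() -> SentenceEncoder:
    # Tiny, randomly initialised encoder mirroring the helpers above.
    layers = [
        TransformerLayer(
            embedding_dim=12,
            attention=MultiheadSelfAttention(embed_dim=12, num_heads=12, scaling=0.125),
        )
        for _ in range(2)
    ]
    return SentenceEncoder(Transformer(vocab_size=100, embedding_dim=12, layers=layers))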