Example No. 1
    def __init__(self, args: argparse.Namespace):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        self.args = args
        if isinstance(args, argparse.Namespace):
            self.save_hyperparameters(args)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.args.roberta_path)
        # Load the pretrained weights first ...
        self.model = RobertaForMaskedLM.from_pretrained(self.args.roberta_path)

        # ... then replace the model with a randomly initialized one of the same
        # architecture; only the tokenizer and config are reused from the checkpoint.
        self.robert_config = RobertaConfig.from_pretrained(
            self.args.roberta_path, output_hidden_states=False)
        self.model = RobertaForMaskedLM(self.robert_config)

        self.loss_fn = CrossEntropyLoss(reduction="none")
        self.acc = MaskedAccuracy(num_classes=self.tokenizer.vocab_size)
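
The constructor above loads a pretrained tokenizer and config from `roberta_path`, but then replaces the pretrained model with a freshly initialized one built from that config. A minimal standalone sketch of the same steps, assuming the `transformers` package and using "roberta-base" as a stand-in for `args.roberta_path`:

from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer

roberta_path = "roberta-base"  # stand-in for args.roberta_path
tokenizer = RobertaTokenizer.from_pretrained(roberta_path)
config = RobertaConfig.from_pretrained(roberta_path, output_hidden_states=False)
model = RobertaForMaskedLM(config)  # same architecture, randomly initialized weights
print(tokenizer.vocab_size, config.num_hidden_layers)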
Example No. 2
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    from argparse import Namespace

    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)
    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout

    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # Update the fairseq training args from the Hugging Face training args and model
    # config, reusing the remaining settings from the pretrained fairseq model.
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    # Finally, copy every Hugging Face training argument verbatim into the fairseq args.
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        fairseq_layer = roberta.model.decoder.sentence_encoder.layers[i]  # fairseq TransformerSentenceEncoderLayer

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (fairseq_layer.self_attn.k_proj.weight.data.shape ==
                fairseq_layer.self_attn.q_proj.weight.data.shape ==
                fairseq_layer.self_attn.v_proj.weight.data.shape ==
                torch.Size((config.hidden_size, config.hidden_size)))

        fairseq_layer.self_attn.q_proj.weight = self_attn.query.weight
        fairseq_layer.self_attn.q_proj.bias = self_attn.query.bias
        fairseq_layer.self_attn.k_proj.weight = self_attn.key.weight
        fairseq_layer.self_attn.k_proj.bias = self_attn.key.bias
        fairseq_layer.self_attn.v_proj.weight = self_attn.value.weight
        fairseq_layer.self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == fairseq_layer.self_attn.out_proj.weight.shape
        fairseq_layer.self_attn.out_proj.weight = self_output.dense.weight
        fairseq_layer.self_attn.out_proj.bias = self_output.dense.bias
        fairseq_layer.self_attn_layer_norm.weight = self_output.LayerNorm.weight
        fairseq_layer.self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == fairseq_layer.fc1.weight.shape
        fairseq_layer.fc1.weight = intermediate.dense.weight
        fairseq_layer.fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == fairseq_layer.fc2.weight.shape
        fairseq_layer.fc2.weight = bert_output.dense.weight
        fairseq_layer.fc2.bias = bert_output.dense.bias
        fairseq_layer.final_layer_norm.weight = bert_output.LayerNorm.weight
        fairseq_layer.final_layer_norm.bias = bert_output.LayerNorm.bias
        # end of layer

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]

    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # these last two entries were copied from the fairseq pretrained checkpoint
        # just to make .from_pretrained() work
        "extra_state": {
            'train_iterator': {
                'epoch': 0
            },
            'val_loss': 1.4955725940408326
        },
        "optimizer_history": [{
            'criterion_name': 'MaskedLmLoss',
            'optimizer_name': 'MemoryEfficientFP16Optimizer',
            'lr_scheduler_state': {
                'best': 1.495530066777925
            },
            'num_updates': 500000
        }]
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict(), )
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")
    loaded_model = FairseqRobertaModel.from_pretrained(
        roberta_dump_folder_path)
    loaded_model.eval()

    # roberta.model(input_ids)
    # loaded_model.model(input_ids)

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_dict.keys() == copied_dict.keys()
    for k in copied_dict.keys():
        loaded_val = loaded_dict[k]
        copied_val = copied_dict[k]
        if not torch.allclose(loaded_val, copied_val, atol=1e-3):
            print(k)
    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    # except:
    #     print("Fail to save")
    # torch.save(roberta, f"{roberta_dump_folder_path}/model.pt")
    print("Done")
Example No. 3
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path: str,
                                          pytorch_dump_folder_path: str,
                                          classification_head: bool):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    config = RobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    if classification_head:
        model = RobertaForSequenceClassification(config)
    else:
        model = RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.k_proj.weight.data.shape ==
                roberta_layer.self_attn.q_proj.weight.data.shape ==
                roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads[
            "mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads[
            "mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads[
            "mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads[
            "mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](
            roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
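
A minimal command-line wrapper sketch for the converter above; the flag names are illustrative and not taken from the snippet itself.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--roberta_checkpoint_path", required=True,
                        help="Directory containing the fairseq RoBERTa checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", required=True,
                        help="Output directory for the converted Hugging Face model.")
    parser.add_argument("--classification_head", action="store_true",
                        help="Also convert the MNLI classification head.")
    cli_args = parser.parse_args()
    convert_roberta_checkpoint_to_pytorch(cli_args.roberta_checkpoint_path,
                                          cli_args.pytorch_dump_folder_path,
                                          cli_args.classification_head)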
Example No. 4
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 predictions_file=None,
                 layer_freeze_regexes: List[str] = None,
                 probe_type: str = None,
                 loss_on_all_vocab: bool = False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._loss_on_all_vocab = loss_on_all_vocab

        self._predictions_file = predictions_file

        # TODO move to predict
        if predictions_file is not None and os.path.isfile(predictions_file):
            os.remove(predictions_file)

        self._pretrained_model = pretrained_model
        if 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            if loss_on_all_vocab:
                self._transformer_model = RobertaForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetLMHeadModel.from_pretrained(
                pretrained_model)
        elif 'albert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = AlbertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        elif 'bert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = BertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        else:
            raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

        if probe_type == 'MLP':
            layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
        elif probe_type == 'linear':
            layer_freeze_regexes = [
                "embeddings", "encoder", "pooler", "dense", "LayerNorm",
                "layer_norm"
            ]

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(
                    bool(re.search(r, name)) for r in layer_freeze_regexes)
            else:
                grad = requires_grad
            param.requires_grad = grad

        # make sure decoder gradients are on.
        if 'roberta' in pretrained_model:
            self._transformer_model.lm_head.decoder.weight.requires_grad = True
            self._transformer_model.lm_head.bias.requires_grad = True
        elif 'albert' in pretrained_model:
            pass
        elif 'bert' in pretrained_model:
            self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
            self._transformer_model.cls.predictions.bias.requires_grad = True

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = 1
        self._output_dim = self._transformer_model.config.hidden_size

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
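
The `probe_type` logic above boils down to one rule: a parameter stays trainable only if none of the freeze regexes match its name. An isolated sketch of that rule, assuming the `transformers` package and using the 'MLP' probe regexes from the snippet (the tied `lm_head.decoder.weight` is typically reported under the embeddings, which is why the snippet re-enables it explicitly afterwards):

import re

from transformers import RobertaForMaskedLM

layer_freeze_regexes = ["embeddings", "encoder", "pooler"]  # the 'MLP' probe setting
model = RobertaForMaskedLM.from_pretrained("roberta-base")
for name, param in model.named_parameters():
    # Trainable only if no freeze regex matches the parameter name.
    param.requires_grad = not any(re.search(r, name) for r in layer_freeze_regexes)

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable), trainable[:3])  # only lm_head.* parameters remain trainable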
Example No. 5
def convert_roberta_checkpoint_to_pytorch(fairseq_default_path, hf_input_path):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta_hf = RobertaForMaskedLM.from_pretrained(hf_input_path)
    roberta_fairseq = FairseqRobertaModel.from_pretrained(fairseq_default_path)

    # Now let's copy all the weights.
    # Embeddings
    roberta_hf_sent_encoder = roberta_hf.roberta.embeddings
    roberta_fairseq.model.decoder.sentence_encoder.embed_tokens.weight = roberta_hf_sent_encoder.word_embeddings.weight
    # fairseq roberta doesn't use `token_type_embeddings`, so as a workaround, add it to the `position_embeddings`
    roberta_fairseq.model.decoder.sentence_encoder.embed_positions.weight.data = roberta_hf_sent_encoder.position_embeddings.weight.data + roberta_hf_sent_encoder.token_type_embeddings.weight.data
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.weight = roberta_hf_sent_encoder.LayerNorm.weight
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.bias = roberta_hf_sent_encoder.LayerNorm.bias

    for i in range(len(roberta_hf.roberta.encoder.layer)):
        # Encoder: start of layer
        roberta_hf_layer: BertLayer = roberta_hf.roberta.encoder.layer[i]
        roberta_fairseq_layer: TransformerSentenceEncoderLayer = roberta_fairseq.model.decoder.sentence_encoder.layers[i]
        # roberta_fairseq_layer.self_attn.enable_torch_version = False

        # self attention
        hf_self_attn: BertSelfAttention = roberta_hf_layer.attention.self
        fairseq_self_attn = roberta_fairseq_layer.self_attn  # fairseq MultiheadAttention
        fairseq_self_attn.q_proj.weight = hf_self_attn.query.weight
        fairseq_self_attn.q_proj.bias = hf_self_attn.query.bias
        fairseq_self_attn.k_proj.weight = hf_self_attn.key.weight
        fairseq_self_attn.k_proj.bias = hf_self_attn.key.bias
        fairseq_self_attn.v_proj.weight = hf_self_attn.value.weight
        fairseq_self_attn.v_proj.bias = hf_self_attn.value.bias

        # self-attention output
        hf_self_output: BertSelfOutput = roberta_hf_layer.attention.output
        assert(
            hf_self_output.dense.weight.shape == roberta_fairseq_layer.self_attn.out_proj.weight.shape
        )
        roberta_fairseq_layer.self_attn.out_proj.weight = hf_self_output.dense.weight
        roberta_fairseq_layer.self_attn.out_proj.bias = hf_self_output.dense.bias
        roberta_fairseq_layer.self_attn_layer_norm.weight = hf_self_output.LayerNorm.weight
        roberta_fairseq_layer.self_attn_layer_norm.bias = hf_self_output.LayerNorm.bias

        # intermediate
        hf_intermediate: BertIntermediate = roberta_hf_layer.intermediate
        assert(
            hf_intermediate.dense.weight.shape == roberta_fairseq_layer.fc1.weight.shape
        )
        roberta_fairseq_layer.fc1.weight = hf_intermediate.dense.weight
        roberta_fairseq_layer.fc1.bias = hf_intermediate.dense.bias

        # output
        hf_bert_output: BertOutput = roberta_hf_layer.output
        assert(
            hf_bert_output.dense.weight.shape == roberta_fairseq_layer.fc2.weight.shape
        )
        roberta_fairseq_layer.fc2.weight = hf_bert_output.dense.weight
        roberta_fairseq_layer.fc2.bias = hf_bert_output.dense.bias
        roberta_fairseq_layer.final_layer_norm.weight = hf_bert_output.LayerNorm.weight
        roberta_fairseq_layer.final_layer_norm.bias = hf_bert_output.LayerNorm.bias
        # end of layer

    roberta_fairseq.model.decoder.lm_head.dense.weight = roberta_hf.lm_head.dense.weight
    roberta_fairseq.model.decoder.lm_head.dense.bias = roberta_hf.lm_head.dense.bias
    roberta_fairseq.model.decoder.lm_head.layer_norm.weight = roberta_hf.lm_head.layer_norm.weight
    roberta_fairseq.model.decoder.lm_head.layer_norm.bias = roberta_hf.lm_head.layer_norm.bias
    roberta_fairseq.model.decoder.lm_head.weight = roberta_hf.lm_head.decoder.weight
    roberta_fairseq.model.decoder.lm_head.bias = roberta_hf.lm_head.bias

    # Let's check that we get the same results.
    roberta_hf.eval()  # disable dropout
    roberta_fairseq.eval()  # disable dropout
    input_ids: torch.Tensor = roberta_fairseq.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
    our_output = roberta_hf(input_ids)[0]
    their_output = roberta_fairseq.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print(
        "Do both models output the same tensors?",
        "🔥" if success else "💩"
    )
    if not success:
        raise Exception("Something went wRoNg")

    with open(f'{fairseq_default_path}/model.pt', 'rb') as f:
        roberta_fairseq_checkpoint = torch.load(f)
    roberta_fairseq_checkpoint['model'] = roberta_fairseq.model.state_dict()
    fairseq_output_checkpoint_path = f'{hf_input_path}/fairseq.pt'
    print(f"Saving model to {fairseq_output_checkpoint_path}")
    with open(fairseq_output_checkpoint_path, 'wb') as f:
        torch.save(roberta_fairseq_checkpoint, f)
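
Despite its name, the function above copies Hugging Face weights into the fairseq model and writes `fairseq.pt` into the Hugging Face checkpoint directory. A hypothetical invocation with placeholder paths:

convert_roberta_checkpoint_to_pytorch(
    fairseq_default_path="./roberta.base",     # placeholder: local fairseq model dir
    hf_input_path="./hf_roberta_checkpoint",   # placeholder: Hugging Face checkpoint dir
)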
Example No. 6
def convert_roberta_to_transformers(ckpt_path):
    ckpt = torch.load(ckpt_path, map_location="cpu")
    args = ckpt["args"]

    config = BertConfig(
        vocab_size_or_config_json_file=250002,
        hidden_size=args.encoder_embed_dim,
        num_hidden_layers=args.encoder_layers,
        num_attention_heads=args.encoder_attention_heads,
        intermediate_size=args.encoder_ffn_embed_dim,
        max_position_embeddings=args.max_positions + 2,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )

    print("Our BERT config:", config)

    stat_dict = ckpt["model"]
    new_stat_dict = {}

    model = RobertaForMaskedLM(config)
    model.eval()

    sent_enc = "decoder.sentence_encoder"
    new_stat_dict["roberta.embeddings.word_embeddings.weight"] = stat_dict[
        sent_enc + ".embed_tokens.weight"]
    new_stat_dict["roberta.embeddings.position_embeddings.weight"] = stat_dict[
        sent_enc + ".embed_positions.weight"]

    new_stat_dict[
        "roberta.embeddings.token_type_embeddings.weight"] = torch.zeros_like(
            model.roberta.embeddings.token_type_embeddings.weight)

    new_stat_dict["roberta.embeddings.LayerNorm.weight"] = stat_dict[
        sent_enc + ".emb_layer_norm.weight"]
    new_stat_dict["roberta.embeddings.LayerNorm.bias"] = stat_dict[
        sent_enc + ".emb_layer_norm.bias"]

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        # layer: BertLayer = model.roberta.encoder.layer[i]
        layer = "roberta.encoder.layer.%d" % i
        roberta_layer = sent_enc + (".layers.%d" % i)

        ### self attention
        # self_attn: BertSelfAttention = layer.attention.self
        self_attn = layer + ".attention.self"
        assert(
          stat_dict[roberta_layer+".self_attn.k_proj.weight"].data.shape == \
          stat_dict[roberta_layer+".self_attn.q_proj.weight"].data.shape == \
          stat_dict[roberta_layer+".self_attn.v_proj.weight"].data.shape == \
          torch.Size((config.hidden_size, config.hidden_size))
        )

        new_stat_dict[self_attn + ".query.weight"] = stat_dict[roberta_layer + ".self_attn.q_proj.weight"]
        new_stat_dict[self_attn + ".query.bias"] = stat_dict[roberta_layer + ".self_attn.q_proj.bias"]
        new_stat_dict[self_attn + ".key.weight"] = stat_dict[roberta_layer + ".self_attn.k_proj.weight"]
        new_stat_dict[self_attn + ".key.bias"] = stat_dict[roberta_layer + ".self_attn.k_proj.bias"]
        new_stat_dict[self_attn + ".value.weight"] = stat_dict[roberta_layer + ".self_attn.v_proj.weight"]
        new_stat_dict[self_attn + ".value.bias"] = stat_dict[roberta_layer + ".self_attn.v_proj.bias"]

        ### self-attention output
        # self_output: BertSelfOutput = layer.attention.output
        self_output = layer + ".attention.output"
        assert (model.roberta.encoder.layer[i].attention.output.dense.weight.shape ==
                stat_dict[roberta_layer + ".self_attn.out_proj.weight"].shape)
        new_stat_dict[self_output + ".dense.weight"] = stat_dict[roberta_layer + ".self_attn.out_proj.weight"]
        new_stat_dict[self_output + ".dense.bias"] = stat_dict[roberta_layer + ".self_attn.out_proj.bias"]
        new_stat_dict[self_output + ".LayerNorm.weight"] = stat_dict[roberta_layer + ".self_attn_layer_norm.weight"]
        new_stat_dict[self_output + ".LayerNorm.bias"] = stat_dict[roberta_layer + ".self_attn_layer_norm.bias"]

        ### intermediate
        # intermediate: BertIntermediate = layer.intermediate
        intermediate = layer + ".intermediate"
        assert (model.roberta.encoder.layer[i].intermediate.dense.weight.shape ==
                stat_dict[roberta_layer + ".fc1.weight"].shape)
        # TODO
        new_stat_dict[intermediate + ".dense.weight"] = stat_dict[roberta_layer + ".fc1.weight"]
        new_stat_dict[intermediate + ".dense.bias"] = stat_dict[roberta_layer + ".fc1.bias"]

        ### output
        # bert_output: BertOutput = layer.output
        bert_output = layer + ".output"
        assert (model.roberta.encoder.layer[i].output.dense.weight.shape ==
                stat_dict[roberta_layer + ".fc2.weight"].shape)
        new_stat_dict[bert_output + ".dense.weight"] = stat_dict[roberta_layer + ".fc2.weight"]
        new_stat_dict[bert_output + ".dense.bias"] = stat_dict[roberta_layer + ".fc2.bias"]
        new_stat_dict[bert_output + ".LayerNorm.weight"] = stat_dict[roberta_layer + ".final_layer_norm.weight"]
        new_stat_dict[bert_output + ".LayerNorm.bias"] = stat_dict[roberta_layer + ".final_layer_norm.bias"]
        #### end of layer

    new_stat_dict["lm_head.dense.weight"] = stat_dict[
        "decoder.lm_head.dense.weight"]
    new_stat_dict["lm_head.dense.bias"] = stat_dict[
        "decoder.lm_head.dense.bias"]
    new_stat_dict["lm_head.layer_norm.weight"] = stat_dict[
        "decoder.lm_head.layer_norm.weight"]
    new_stat_dict["lm_head.layer_norm.bias"] = stat_dict[
        "decoder.lm_head.layer_norm.bias"]
    new_stat_dict["lm_head.decoder.weight"] = stat_dict[
        "decoder.lm_head.weight"]
    new_stat_dict["lm_head.bias"] = stat_dict["decoder.lm_head.bias"]

    # The pooler is not present in the fairseq checkpoint, so reuse the randomly
    # initialized pooler weights from the freshly constructed model.
    new_stat_dict["roberta.pooler.dense.weight"] = model.roberta.pooler.dense.weight
    new_stat_dict["roberta.pooler.dense.bias"] = model.roberta.pooler.dense.bias

    return new_stat_dict
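
A sketch of how the returned state dict might be loaded into a Hugging Face model; the checkpoint path is a placeholder, and the config mirrors the one built inside the function (the hard-coded 250002 vocabulary corresponds to an XLM-R style checkpoint):

ckpt_path = "./xlmr.base/model.pt"  # placeholder
new_state_dict = convert_roberta_to_transformers(ckpt_path)

ckpt_args = torch.load(ckpt_path, map_location="cpu")["args"]
hf_config = BertConfig(
    vocab_size_or_config_json_file=250002,
    hidden_size=ckpt_args.encoder_embed_dim,
    num_hidden_layers=ckpt_args.encoder_layers,
    num_attention_heads=ckpt_args.encoder_attention_heads,
    intermediate_size=ckpt_args.encoder_ffn_embed_dim,
    max_position_embeddings=ckpt_args.max_positions + 2,
    type_vocab_size=1,
    layer_norm_eps=1e-5,
)
hf_model = RobertaForMaskedLM(hf_config)
# strict=False tolerates version differences such as a missing pooler.
missing, unexpected = hf_model.load_state_dict(new_state_dict, strict=False)
print("missing keys:", missing, "unexpected keys:", unexpected)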