Example #1
from transformers import XLMRobertaConfig, XLMRobertaModel


def xlmr_model():
    # Build a deliberately tiny XLM-RoBERTa model, e.g. as a lightweight test fixture.
    config = XLMRobertaConfig(
        vocab_size=251000,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=256,
        type_vocab_size=2,
        is_decoder=False,
        initializer_range=0.02,
    )
    return XLMRobertaModel(config=config)
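
For reference, a minimal usage sketch (not part of the original snippet) that exercises the tiny fixture; the batch shape and random token ids below are arbitrary:

import torch

model = xlmr_model()
model.eval()

# Dummy batch: 2 sequences of 10 arbitrary token ids drawn from the fixture's vocabulary.
input_ids = torch.randint(0, model.config.vocab_size, (2, 10))
with torch.no_grad():
    outputs = model(input_ids)
print(outputs.last_hidden_state.shape)  # torch.Size([2, 10, 32]) -> (batch, seq_len, hidden_size)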
Example #2
import pathlib

import torch
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer

from transformers import (
    XLMRobertaConfig,
    XLMRobertaXLForMaskedLM,
    XLMRobertaXLForSequenceClassification,
)
from transformers.models.bert.modeling_bert import (
    BertIntermediate,
    BertLayer,
    BertOutput,
    BertSelfAttention,
    BertSelfOutput,
)
from transformers.models.roberta.modeling_roberta import RobertaAttention

SAMPLE_TEXT = "Hello world! cécé herlolip"  # any short sample sentence works for the parity check


def convert_xlm_roberta_xl_checkpoint_to_pytorch(
    roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
    """
    Copy/paste/tweak the fairseq RoBERTa weights into our BERT-style structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.encoder.sentence_encoder
    config = XLMRobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.cfg.model.encoder_embed_dim,
        num_hidden_layers=roberta.cfg.model.encoder_layers,
        num_attention_heads=roberta.cfg.model.encoder_attention_heads,
        intermediate_size=roberta.cfg.model.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]

    print("Our RoBERTa config:", config)

    model = XLMRobertaXLForSequenceClassification(config) if classification_head else XLMRobertaXLForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.

    model.roberta.encoder.LayerNorm.weight = roberta_sent_encoder.layer_norm.weight
    model.roberta.encoder.LayerNorm.bias = roberta_sent_encoder.layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        attention: RobertaAttention = layer.attention
        attention.self_attn_layer_norm.weight = roberta_layer.self_attn_layer_norm.weight
        attention.self_attn_layer_norm.bias = roberta_layer.self_attn_layer_norm.bias

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.k_proj.weight.data.shape ==
                roberta_layer.self_attn.q_proj.weight.data.shape ==
                roberta_layer.self_attn.v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias

        # this one is final layer norm
        layer.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        layer.LayerNorm.bias = roberta_layer.final_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
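
As a usage sketch, the converter is typically driven from the command line; the argument names below follow the pattern of similar conversion scripts and are assumptions, not part of the snippet above:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Illustrative flag names; adapt them to your checkpoint layout.
    parser.add_argument("--roberta_checkpoint_path", type=str, required=True, help="Path to the fairseq checkpoint directory.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True, help="Where to save the converted Hugging Face model.")
    parser.add_argument("--classification_head", action="store_true", help="Also convert the 'mnli' classification head.")
    args = parser.parse_args()
    convert_xlm_roberta_xl_checkpoint_to_pytorch(
        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
    )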