Example #1
0
    def __init__(self, args: argparse.Namespace):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        self.args = args
        if isinstance(args, argparse.Namespace):
            self.save_hyperparameters(args)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.args.roberta_path)

        # Build the masked-LM model from the checkpoint's config; the weights
        # are freshly initialized here, only the configuration is reused.
        self.robert_config = RobertaConfig.from_pretrained(
            self.args.roberta_path, output_hidden_states=False)
        self.model = RobertaForMaskedLM(self.robert_config)

        self.loss_fn = CrossEntropyLoss(reduction="none")
        self.acc = MaskedAccuracy(num_classes=self.tokenizer.vocab_size)
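
For reference, the constructor above only reads `args.roberta_path`; a minimal sketch of building a matching namespace (the flag name and the owning class name are assumptions, not shown in the snippet):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--roberta_path", type=str, default="roberta-base",
                    help="Path or hub name of the RoBERTa checkpoint.")
args = parser.parse_args(["--roberta_path", "roberta-base"])
# module = RobertaMLMModule(args)  # hypothetical name for the class owning the __init__ above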
Example #2
0
def convert_pytorch_to_roberta_checkpoint(pytorch_checkpoint_path: str,
                                          roberta_dump_folder_path: str):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    from argparse import Namespace

    model = RobertaForMaskedLM.from_pretrained(pytorch_checkpoint_path)
    config = RobertaConfig.from_pretrained(pytorch_checkpoint_path)

    # Recover the Hugging Face TrainingArguments that were saved alongside the
    # checkpoint, as a plain Namespace.
    huggingface_train_args = Namespace(
        **vars(torch.load(f"{pytorch_checkpoint_path}/training_args.bin")))
    model.eval()  # disable dropout

    # tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint_path)
    if config.num_hidden_layers == 12:
        roberta = FairseqRobertaModel.from_pretrained("roberta.base")
    elif config.num_hidden_layers == 24:
        roberta = FairseqRobertaModel.from_pretrained("roberta.large")
    else:
        raise Exception("Only roberta LM is supported!")
    roberta.eval()
    # roberta_sent_encoder = roberta.model.decoder.sentence_encoder

    # Update the fairseq training args from the Hugging Face training arguments,
    # reusing the remaining settings from the pretrained fairseq model.
    roberta.args.warmup_updates = huggingface_train_args.warmup_steps
    roberta.args.weight_decay = huggingface_train_args.weight_decay
    roberta.args.adam_eps = huggingface_train_args.adam_epsilon
    roberta.args.clip_norm = huggingface_train_args.max_grad_norm
    roberta.args.max_update = huggingface_train_args.max_steps
    roberta.args.total_num_update = huggingface_train_args.max_steps
    roberta.args.save_interval_updates = huggingface_train_args.save_steps

    roberta.args.attention_dropout = config.attention_probs_dropout_prob
    roberta.args.encoder_embed_dim = config.hidden_size
    roberta.args.encoder_ffn_embed_dim = config.intermediate_size
    roberta.args.activation_fn = config.hidden_act
    roberta.args.activation_dropout = config.hidden_dropout_prob
    roberta.args.encoder_layers = config.num_hidden_layers
    roberta.args.encoder_attention_heads = config.num_attention_heads
    # Finally, merge every remaining Hugging Face training argument into the
    # fairseq args namespace (overwriting any overlapping keys set above).
    roberta.args.__dict__.update(huggingface_train_args.__dict__)

    roberta.model.decoder.sentence_encoder.embed_tokens.weight = model.roberta.embeddings.word_embeddings.weight
    roberta.model.decoder.sentence_encoder.embed_positions.weight = model.roberta.embeddings.position_embeddings.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    roberta.model.decoder.sentence_encoder.emb_layer_norm.weight = model.roberta.embeddings.LayerNorm.weight
    roberta.model.decoder.sentence_encoder.emb_layer_norm.bias = model.roberta.embeddings.LayerNorm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        # roberta.model.decoder.sentence_encoder.layers[i]: TransformerSentenceEncoderLayer = roberta.model.decoder.sentence_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                k_proj.weight.data.shape == roberta.model.decoder.
                sentence_encoder.layers[i].self_attn.q_proj.weight.data.shape
                == roberta.model.decoder.sentence_encoder.layers[i].self_attn.
                v_proj.weight.data.shape == torch.Size(
                    (config.hidden_size, config.hidden_size)))

        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.weight = self_attn.query.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.q_proj.bias = self_attn.query.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.weight = self_attn.key.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.k_proj.bias = self_attn.key.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.weight = self_attn.value.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.v_proj.bias = self_attn.value.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.weight = self_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn.out_proj.bias = self_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.weight = self_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].self_attn_layer_norm.bias = self_output.LayerNorm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.weight = intermediate.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc1.bias = intermediate.dense.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight.shape
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.weight = bert_output.dense.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].fc2.bias = bert_output.dense.bias
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.weight = bert_output.LayerNorm.weight
        roberta.model.decoder.sentence_encoder.layers[
            i].final_layer_norm.bias = bert_output.LayerNorm.bias

    # LM Head
    roberta.model.decoder.lm_head.dense.weight = model.lm_head.dense.weight
    roberta.model.decoder.lm_head.dense.bias = model.lm_head.dense.bias
    roberta.model.decoder.lm_head.layer_norm.weight = model.lm_head.layer_norm.weight
    roberta.model.decoder.lm_head.layer_norm.bias = model.lm_head.layer_norm.bias
    roberta.model.decoder.lm_head.weight = model.lm_head.decoder.weight
    roberta.model.decoder.lm_head.bias = model.lm_head.decoder.bias

    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1
    their_output = model(input_ids)[0]
    our_output = roberta.model(input_ids)[0]

    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    copy_success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if copy_success else "💩")
    if not copy_success:
        raise Exception("Something went wRoNg")

    pathlib.Path(roberta_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {roberta_dump_folder_path}")
    from fairseq import checkpoint_utils
    state_dict = {
        "args": roberta.args,
        "model": roberta.model.state_dict(),
        # The last two entries are copied from the fairseq pretrained checkpoint
        # just so that FairseqRobertaModel.from_pretrained() works.
        "extra_state": {
            "train_iterator": {"epoch": 0},
            "val_loss": 1.4955725940408326,
        },
        "optimizer_history": [{
            "criterion_name": "MaskedLmLoss",
            "optimizer_name": "MemoryEfficientFP16Optimizer",
            "lr_scheduler_state": {"best": 1.495530066777925},
            "num_updates": 500000,
        }],
    }
    # checkpoint_utils.save_state(f"{roberta_dump_folder_path}/model.pt", roberta.args, roberta.state_dict(), )
    # del model
    checkpoint_utils.torch_persistent_save(
        state_dict, f"{roberta_dump_folder_path}/model.pt")
    loaded_model = FairseqRobertaModel.from_pretrained(
        roberta_dump_folder_path)
    loaded_model.eval()

    # roberta.model(input_ids)
    # loaded_model.model(input_ids)

    del state_dict
    copied_dict = roberta.state_dict()
    loaded_dict = loaded_model.state_dict()
    assert loaded_dict.keys() == copied_dict.keys()
    for k in copied_dict.keys():
        if not torch.allclose(loaded_dict[k], copied_dict[k], atol=1e-3):
            print(k)  # report any tensor that changed across save/load
    loaded_output = loaded_model.model(input_ids)[0]
    save_success = torch.allclose(our_output, loaded_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if save_success else "💩")
    if not save_success:
        raise Exception("Something went wRoNg")
    # torch.save(roberta, f"{roberta_dump_folder_path}/model.pt")
    print("Done")
Example #3
0
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 predictions_file=None,
                 layer_freeze_regexes: List[str] = None,
                 probe_type: str = None,
                 loss_on_all_vocab: bool = False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._loss_on_all_vocab = loss_on_all_vocab

        self._predictions_file = predictions_file

        # TODO move to predict
        if predictions_file is not None and os.path.isfile(predictions_file):
            os.remove(predictions_file)

        self._pretrained_model = pretrained_model
        if 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            if loss_on_all_vocab:
                self._transformer_model = RobertaForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetLMHeadModel.from_pretrained(
                pretrained_model)
        elif 'albert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = AlbertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        elif 'bert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = BertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        else:
            raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

        if probe_type == 'MLP':
            layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
        elif probe_type == 'linear':
            layer_freeze_regexes = [
                "embeddings", "encoder", "pooler", "dense", "LayerNorm",
                "layer_norm"
            ]

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(
                    bool(re.search(r, name)) for r in layer_freeze_regexes)
            else:
                grad = requires_grad
            param.requires_grad = grad

        # Make sure the decoder gradients stay on.
        if 'roberta' in pretrained_model:
            self._transformer_model.lm_head.decoder.weight.requires_grad = True
            self._transformer_model.lm_head.bias.requires_grad = True
        elif 'albert' in pretrained_model:
            pass
        elif 'bert' in pretrained_model:
            self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
            self._transformer_model.cls.predictions.bias.requires_grad = True

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = 1
        self._output_dim = self._transformer_model.config.hidden_size

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
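
The freezing logic above can be exercised in isolation; a minimal, standalone sketch (the checkpoint name is arbitrary) that reproduces the probe_type == 'MLP' behaviour of freezing everything under embeddings/encoder/pooler:

import re
from transformers import BertForMaskedLM

model = BertForMaskedLM.from_pretrained("bert-base-uncased")
layer_freeze_regexes = ["embeddings", "encoder", "pooler"]  # probe_type == 'MLP'

for name, param in model.named_parameters():
    param.requires_grad = not any(re.search(r, name) for r in layer_freeze_regexes)

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(f"{len(trainable)} trainable parameter tensors, e.g. {trainable[:3]}")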
Example #4
0
def convert_roberta_checkpoint_to_pytorch(fairseq_default_path, hf_input_path):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta_hf = RobertaForMaskedLM.from_pretrained(hf_input_path)
    roberta_fairseq = FairseqRobertaModel.from_pretrained(fairseq_default_path)

    # Now let's copy all the weights.
    # Embeddings
    roberta_hf_sent_encoder = roberta_hf.roberta.embeddings
    roberta_fairseq.model.decoder.sentence_encoder.embed_tokens.weight = roberta_hf_sent_encoder.word_embeddings.weight
    # fairseq roberta doesn't use `token_type_embeddings`, so as a workaround, add it to the `position_embeddings`
    roberta_fairseq.model.decoder.sentence_encoder.embed_positions.weight.data = roberta_hf_sent_encoder.position_embeddings.weight.data + roberta_hf_sent_encoder.token_type_embeddings.weight.data
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.weight = roberta_hf_sent_encoder.LayerNorm.weight
    roberta_fairseq.model.decoder.sentence_encoder.emb_layer_norm.bias = roberta_hf_sent_encoder.LayerNorm.bias

    for i in range(len(roberta_hf.roberta.encoder.layer)):
        # Encoder: start of layer
        roberta_hf_layer: BertLayer = roberta_hf.roberta.encoder.layer[i]
        roberta_fairseq_layer: TransformerSentenceEncoderLayer = roberta_fairseq.model.decoder.sentence_encoder.layers[i]
        # roberta_fairseq_layer.self_attn.enable_torch_version = False

        # self attention
        hf_self_attn: BertSelfAttention = roberta_hf_layer.attention.self
        fairseq_self_attn = roberta_fairseq_layer.self_attn  # fairseq MultiheadAttention
        fairseq_self_attn.q_proj.weight = hf_self_attn.query.weight
        fairseq_self_attn.q_proj.bias = hf_self_attn.query.bias
        fairseq_self_attn.k_proj.weight = hf_self_attn.key.weight
        fairseq_self_attn.k_proj.bias = hf_self_attn.key.bias
        fairseq_self_attn.v_proj.weight = hf_self_attn.value.weight
        fairseq_self_attn.v_proj.bias = hf_self_attn.value.bias

        # self-attention output
        hf_self_output: BertSelfOutput = roberta_hf_layer.attention.output
        assert(
            hf_self_output.dense.weight.shape == roberta_fairseq_layer.self_attn.out_proj.weight.shape
        )
        roberta_fairseq_layer.self_attn.out_proj.weight = hf_self_output.dense.weight
        roberta_fairseq_layer.self_attn.out_proj.bias = hf_self_output.dense.bias
        roberta_fairseq_layer.self_attn_layer_norm.weight = hf_self_output.LayerNorm.weight
        roberta_fairseq_layer.self_attn_layer_norm.bias = hf_self_output.LayerNorm.bias

        # intermediate
        hf_intermediate: BertIntermediate = roberta_hf_layer.intermediate
        assert(
            hf_intermediate.dense.weight.shape == roberta_fairseq_layer.fc1.weight.shape
        )
        roberta_fairseq_layer.fc1.weight = hf_intermediate.dense.weight
        roberta_fairseq_layer.fc1.bias = hf_intermediate.dense.bias

        # output
        hf_bert_output: BertOutput = roberta_hf_layer.output
        assert(
            hf_bert_output.dense.weight.shape == roberta_fairseq_layer.fc2.weight.shape
        )
        roberta_fairseq_layer.fc2.weight = hf_bert_output.dense.weight
        roberta_fairseq_layer.fc2.bias = hf_bert_output.dense.bias
        roberta_fairseq_layer.final_layer_norm.weight = hf_bert_output.LayerNorm.weight
        roberta_fairseq_layer.final_layer_norm.bias = hf_bert_output.LayerNorm.bias
        # end of layer

    roberta_fairseq.model.decoder.lm_head.dense.weight = roberta_hf.lm_head.dense.weight
    roberta_fairseq.model.decoder.lm_head.dense.bias = roberta_hf.lm_head.dense.bias
    roberta_fairseq.model.decoder.lm_head.layer_norm.weight = roberta_hf.lm_head.layer_norm.weight
    roberta_fairseq.model.decoder.lm_head.layer_norm.bias = roberta_hf.lm_head.layer_norm.bias
    roberta_fairseq.model.decoder.lm_head.weight = roberta_hf.lm_head.decoder.weight
    roberta_fairseq.model.decoder.lm_head.bias = roberta_hf.lm_head.bias

    # Let's check that we get the same results.
    roberta_hf.eval()  # disable dropout
    roberta_fairseq.eval()  # disable dropout
    input_ids: torch.Tensor = roberta_fairseq.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
    our_output = roberta_hf(input_ids)[0]
    their_output = roberta_fairseq.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print(
        "Do both models output the same tensors?",
        "🔥" if success else "💩"
    )
    if not success:
        raise Exception("Something went wRoNg")

    with open(f'{fairseq_default_path}/model.pt', 'rb') as f:
        roberta_fairseq_checkpoint = torch.load(f)
    roberta_fairseq_checkpoint['model'] = roberta_fairseq.model.state_dict()
    fairseq_output_checkpoint_path = f'{hf_input_path}/fairseq.pt'
    print(f"Saving model to {fairseq_output_checkpoint_path}")
    with open(fairseq_output_checkpoint_path, 'wb') as f:
        torch.save(roberta_fairseq_checkpoint, f)
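
As with the previous conversion script, a hypothetical call site (both paths are placeholders, not taken from the original code):

if __name__ == "__main__":
    # Assumed layout: a stock fairseq checkpoint to borrow metadata from, and a
    # Hugging Face RoBERTa folder whose weights should be exported to fairseq.
    convert_roberta_checkpoint_to_pytorch(
        fairseq_default_path="./roberta.base",
        hf_input_path="./my-hf-roberta",
    )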