Example #1
def convert_to_long_model(model_name, tokenizer_name, save_model_to,
                          attention_window, max_pos):
    """
    Starting from the roberta-base checkpoint, the following function converts it into an instance 
    of RobertaLong.

    Args:
        save_model_to (str): path to output dir
        attention_window (int): 
        max_pos (int): max model position before adding extra 2 tokens for roberta models

    Returns:
        transformers.RobertaForMaskedLM: RoBERTa model with LM head on top
    """
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name,
                                                     model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k +
            step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'      saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
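A minimal usage sketch for the helper above; it assumes the surrounding imports (RobertaForMaskedLM, RobertaTokenizerFast, LongformerSelfAttention, and a module-level logger) are already in place, and the output directory name is purely illustrative:

model, tokenizer = convert_to_long_model(
    model_name='roberta-base',
    tokenizer_name='roberta-base',
    save_model_to='roberta-base-4096',  # illustrative output directory
    attention_window=512,
    max_pos=4096)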
Example #2
def create_long_model(model_type,
                      model,
                      tokenizer,
                      config,
                      attention_window=512,
                      max_pos=4096):
    """Convert RoBERTa to Longformer.
    for other model_type like BERT, replacing model.encoder.layer.attention.self to LongformerSelfAttension()
    is not available at this time.
    """
    from transformers.modeling_longformer import LongformerSelfAttention
    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.embeddings.position_embeddings.weight.shape

    if model_type in ['roberta']:
        max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2

    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 0
    step = current_max_pos
    b = 0
    if model_type in ['roberta']:  # NOTE: RoBERTa has positions 0,1 reserved
        k = 2
        step = current_max_pos - 2
        b = 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k + step)] = model.embeddings.position_embeddings.weight[b:]
        k += step
    model.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    return model, tokenizer, config
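This variant works on already-loaded objects and does not save anything itself. A sketch of how it might be driven, assuming a transformers 3.x environment (matching the import inside the function) and using roberta-base as the base checkpoint:

from transformers import AutoConfig, AutoModel, AutoTokenizer

config = AutoConfig.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# the function accesses model.embeddings and model.encoder directly,
# so the base encoder (not the MLM wrapper) is passed in
model = AutoModel.from_pretrained('roberta-base')

long_model, long_tokenizer, long_config = create_long_model(
    'roberta', model, tokenizer, config, attention_window=512, max_pos=4096)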
Example #3
def create_long_model(save_model_to, attention_window, max_pos):
    model = BertForMaskedLM.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
    config = model.config
    tokenizer = BertTokenizerFast.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        model_max_length=max_pos)
    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    #max_pos += 2  # not needed here: unlike RoBERTa, BERT does not reserve positions 0,1
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    model.bert.embeddings.register_buffer(
        "position_ids",
        torch.arange(config.max_position_embeddings).expand((1, -1)),
    )

    # copy position embeddings over and over to initialize the new position embeddings
    k = 0
    step = current_max_pos
    while k < max_pos - 1:
        new_pos_embed[k:(
            k + step)] = model.bert.embeddings.position_embeddings.weight
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
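A possible invocation of the PubMedBERT variant above; the output directory name is illustrative:

model, tokenizer = create_long_model(save_model_to='pubmedbert-base-4096',
                                     attention_window=512,
                                     max_pos=4096)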
Example #4
    def create_long_model(model_name, save_model_to, attention_window,
                          max_pos):
        model = BertForMaskedLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  model_max_length=max_pos)
        config = model.config

        # extend position embeddings
        tokenizer.model_max_length = max_pos
        tokenizer.init_kwargs['model_max_length'] = max_pos
        current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape

        config.max_position_embeddings = max_pos
        assert max_pos > current_max_pos
        # allocate a larger position embedding matrix
        new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
            max_pos, embed_size)
        # copy position embeddings over and over to initialize the new position embeddings
        k = 0
        step = current_max_pos

        while k < max_pos - 1:
            if k + step < max_pos:
                new_pos_embed[k:(
                    k +
                    step)] = model.bert.embeddings.position_embeddings.weight
            else:
                # initialize the final chunk too, instead of leaving it as
                # uninitialized memory from new_empty
                new_pos_embed[k:] = model.bert.embeddings.position_embeddings.weight[:max_pos - k]
            k += step

        model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

        # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
        config.attention_window = [attention_window] * config.num_hidden_layers

        for i, layer in enumerate(model.bert.encoder.layer):
            longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
            longformer_self_attn.query = layer.attention.self.query
            longformer_self_attn.key = layer.attention.self.key
            longformer_self_attn.value = layer.attention.self.value

            longformer_self_attn.query_global = layer.attention.self.query
            longformer_self_attn.key_global = layer.attention.self.key
            longformer_self_attn.value_global = layer.attention.self.value

            layer.attention.self = longformer_self_attn

        logger.info(f'saving model to {save_model_to}')
        model.save_pretrained(save_model_to)
        tokenizer.save_pretrained(save_model_to)
        return model, tokenizer
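The definition above is indented, so it presumably lives inside a class or another function. Assuming it is reachable as a plain function, a call might look like this (model name and output path are illustrative):

model, tokenizer = create_long_model(model_name='bert-base-uncased',
                                     save_model_to='bert-base-uncased-4096',
                                     attention_window=512,
                                     max_pos=4096)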
Example #5
def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                                     model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k +
            step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    #     model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(
            layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(
            layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(
            layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
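A quick sanity check after running the conversion above; the output directory is illustrative, and the +2 accounts for RoBERTa's two reserved positions:

model, tokenizer = create_long_model('roberta-base-4096',
                                     attention_window=512,
                                     max_pos=4096)
assert model.roberta.embeddings.position_embeddings.weight.shape[0] == 4096 + 2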
Example #6
def create_long_model(save_model_to, attention_window, max_pos,
                      pretrained_config, pretrained_checkpoint,
                      pretrained_tokenizer):
    """
    Convert RoBERTa into Long-Version
    :param save_model_to: the model save path
    :param attention_window: the long-attention defined above
    :param max_pos: extend the position embedding to max_pos=4096
    :return: modified model and tokenizer
    """
    config = BertConfig.from_pretrained(pretrained_config)
    model = BertForMaskedLM.from_pretrained(pretrained_checkpoint,
                                            config=config)
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_tokenizer,
                                                  model_max_length=max_pos)

    # extend position embedding
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    # RoBERTa reserves positions 0 and 1, so its embedding size is max_pos + 2.
    # The BERT-based RoBERTa_zh checkpoint does not reserve them, so the +2 is skipped here.
    #max_pos += 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # init by duplication
    k = 0
    step = current_max_pos
    while k < max_pos - 1:
        new_pos_embed[k:(
            k + step)] = model.bert.embeddings.position_embeddings.weight[0:]
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

    # NOTE: the BERT-based RoBERTa checkpoint has no `position_ids` attribute on `bert.embeddings`,
    # so the following line stays disabled:
    # model.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the modeling_bert.BertSelfAttention obj with LongformerSelfAttention
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(
            layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(
            layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(
            layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
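A sketch of how this variant might be called; the local paths for a Chinese RoBERTa_zh checkpoint are placeholders:

model, tokenizer = create_long_model(
    save_model_to='roberta_zh-4096',                      # placeholder output dir
    attention_window=512,
    max_pos=4096,
    pretrained_config='path/to/roberta_zh/config.json',   # placeholder
    pretrained_checkpoint='path/to/roberta_zh',           # placeholder
    pretrained_tokenizer='path/to/roberta_zh')            # placeholder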
Example #7
def create_long_model(model_specified, attention_window, max_pos,
                      save_model_to):
    """Starting from the `roberta-base` (or similar) checkpoint, the following function converts it into an instance of `RobertaLong`.
     It makes the following changes:
        1)extend the position embeddings from `512` positions to `max_pos`. In Longformer, we set `max_pos=4096`
        2)initialize the additional position embeddings by copying the embeddings of the first `512` positions.
            This initialization is crucial for the model performance (check table 6 in [the paper](https://arxiv.org/pdf/2004.05150.pdf)
            for performance without this initialization)
        3) replaces `modeling_bert.BertSelfAttention` objects with `modeling_longformer.LongformerSelfAttention` with a attention window size `attention_window`

        The output of this function works for long documents even without pretraining.
        Check tables 6 and 11 in [the paper](https://arxiv.org/pdf/2004.05150.pdf) to get a sense of 
        the expected performance of this model before pretraining."""

    model = RobertaForMaskedLM.from_pretrained(
        model_specified)  #,gradient_checkpointing=True)

    tokenizer = RobertaTokenizer.from_pretrained(model_specified,
                                                 model_max_length=max_pos)

    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k +
            step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step

    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(
        new_pos_embed.data)

    # # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is correct; it has to be 4096 (default) or your desired length
    # model.roberta.embeddings.position_ids = torch.arange(
    #     0, model.roberta.embeddings.position_embeddings.num_embeddings
    # )[None]

    model.roberta.embeddings.position_ids.data = torch.tensor(
        [i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)

        longformer_self_attn.query_global = copy.deepcopy(
            layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(
            layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(
            layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
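The variant above saves the converted checkpoint to disk rather than returning it; a minimal call (output path illustrative) would be:

create_long_model('roberta-base', attention_window=512, max_pos=4096,
                  save_model_to='roberta-base-4096')
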
def create_long_model(
        save_model_to,
        model,
        tokenizer,
        attention_window,
        model_max_length
):
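    """Variant that takes an already-loaded `model` and `tokenizer` (a RoBERTa-style
    checkpoint such as XLM-R, see the XLM note below), extends the position
    embeddings to `model_max_length` plus the two reserved positions, and swaps the
    self-attention layers for LongformerSelfAttention."""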

    config = model.config
    position_embeddings = model.roberta.embeddings.position_embeddings

    tokenizer.model_max_length = model_max_length
    tokenizer.init_kwargs['model_max_length'] = model_max_length
    current_model_max_length, embed_size = position_embeddings.weight.shape

    # NOTE: RoBERTa has positions 0,1 reserved
    # embedding size is max position + 2
    model_max_length += 2
    config.max_position_embeddings = model_max_length
    assert model_max_length > current_model_max_length, \
        "New model max_length must be longer than current max_length"

    # BUG for XLM: need to allocate all zeros since the base model is too large
    new_pos_embed = position_embeddings.weight.new_zeros(
        model_max_length, embed_size
    )

    k = 2
    step = current_model_max_length - 2
    while k < model_max_length - 1:
        new_pos_embed[k:(
            k + step)] = position_embeddings.weight[2:]
        k += step

    # HACK for Huggingface transformers >=3.4.0 and < 4.0
    # https://github.com/huggingface/transformers/issues/6465#issuecomment-719042969
    position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(
        new_pos_embed.data
    )
    num_model_embeddings = position_embeddings.num_embeddings
    model.roberta.embeddings.position_ids = torch.arange(
        0, num_model_embeddings
    )[None]

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
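A sketch of driving the second variant; it assumes a RoBERTa-family checkpoint whose masked-LM wrapper exposes `.roberta` (xlm-roberta-base is used here because of the XLM note above), and the output path is illustrative:

from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizerFast

model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
model, tokenizer = create_long_model('xlm-roberta-base-4096', model, tokenizer,
                                     attention_window=512, model_max_length=4096)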