Example #1
    def update_to_sparse_transformer(self,
                                     max_position,
                                     sparsity_config=SparsityConfig(num_heads=4,
                                                                    seq_len=1024)):
        # Extend the position embeddings to max_position, then swap the dense
        # self attention layers for DeepSpeed sparse self attention.
        self.extend_position_embedding(max_position)
        self.replace_model_self_attention_with_sparse_self_attention(
            max_position, sparsity_config)
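A minimal usage sketch for Example #1, assuming a model class that exposes extend_position_embedding and the helpers above; the class name SparseBertForQA and the checkpoint are hypothetical, and the DeepSpeed import path for SparsityConfig is an assumption since the snippets use the name without showing an import.

# Hypothetical usage: only update_to_sparse_transformer and SparsityConfig come
# from the example above; the model class and checkpoint are placeholders.
from deepspeed.ops.sparse_attention import SparsityConfig  # assumed import path

model = SparseBertForQA.from_pretrained('bert-base-uncased')  # hypothetical class
model.update_to_sparse_transformer(
    max_position=2048,
    sparsity_config=SparsityConfig(num_heads=12, seq_len=2048))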
Example #2
def replace_model_self_attention_with_sparse_self_attention(
    model,
    max_position,
    # SparsityConfig parameters need to be set accordingly
    sparsity_config=SparsityConfig(num_heads=4, seq_len=1024)):
    """This function replaces the self attention layers in model encoder with sparse self attention.
    It currently supports bert and roberta model and can be easily extended to any other models following similar steps here.
    For sparsityConfig, refer to the config class.

    Arguments:
        model: required: a transformer model
        max_position: required: an integer determining new position embedding size
        sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class

    Return:
        model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer.
    """

    if hasattr(model, 'bert'):
        model.config.max_position_embeddings = max_position
        replace_self_attention_layer_with_sparse_self_attention_layer(
            model.config, model.bert.encoder.layer, sparsity_config)
    elif hasattr(model, 'roberta'):
        # RoBERTa offsets position ids by padding_idx + 1, hence the extra 2 positions.
        model.config.max_position_embeddings = max_position + 2
        replace_self_attention_layer_with_sparse_self_attention_layer(
            model.config, model.roberta.encoder.layer, sparsity_config)
    else:
        raise ValueError(
            'Please extend "replace_model_self_attention_with_sparse_self_attention" '
            'to support your model type. It currently only supports "bert" & "roberta"!')
    return model
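A hedged usage sketch for Example #2 with a Hugging Face BERT classifier; the transformers import is standard, while the DeepSpeed import path for SparsityConfig is an assumption and the sequence length is arbitrary.

# Sketch: swap a Hugging Face BERT classifier's dense self attention for
# DeepSpeed sparse self attention using the function above.
from transformers import BertForSequenceClassification
from deepspeed.ops.sparse_attention import SparsityConfig  # assumed import path

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model = replace_model_self_attention_with_sparse_self_attention(
    model,
    max_position=2048,
    sparsity_config=SparsityConfig(num_heads=12, seq_len=2048))

Note that the function only updates config.max_position_embeddings and swaps the attention modules; the position embedding matrix itself still has to be extended separately, which is what Example #1 does via extend_position_embedding.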
Example #3
def replace_self_attention_layer_with_sparse_self_attention_layer(
    config,
    layers,
    # SparsityConfig parameters need to be set accordingly
    sparsity_config=SparsityConfig(num_heads=4, seq_len=1024)):
    """This function replaces the self attention layers in attention layer with sparse self attention.
    For sparsityConfig, refer to the config class.

    Arguments:
        config: required: transformer model config
        layers: required: transformer model attention layers
        sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class

    Return:
        layers: updated attention layers; in which self attention layers have been repleaced with DeepSpeed Sparse Self Attention layer.
    """

    for layer in layers:
        deepspeed_sparse_self_attn = BertSparseSelfAttention(
            config, sparsity_config)

        # Reuse the pretrained query/key/value projections so their weights
        # are carried over into the sparse self attention layer.
        deepspeed_sparse_self_attn.query = layer.attention.self.query
        deepspeed_sparse_self_attn.key = layer.attention.self.key
        deepspeed_sparse_self_attn.value = layer.attention.self.value

        layer.attention.self = deepspeed_sparse_self_attn

    return layers
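The per-layer helper in Example #3 is normally driven by the dispatcher in Example #2, but it can also be applied directly to an encoder's layer list. A sketch assuming a Hugging Face BERT model and the same assumed SparsityConfig import path:

# Sketch: replace the self attention module of every encoder layer in place.
from transformers import BertModel
from deepspeed.ops.sparse_attention import SparsityConfig  # assumed import path

bert = BertModel.from_pretrained('bert-base-uncased')
replace_self_attention_layer_with_sparse_self_attention_layer(
    bert.config,
    bert.encoder.layer,
    sparsity_config=SparsityConfig(num_heads=12, seq_len=512))

Because the loop reassigns the original query/key/value projections, the pretrained weights are carried over into the new sparse layers.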
Example #4
    def replace_model_self_attention_with_sparse_self_attention(
        self,
        max_position,
        # SparsityConfig parameters need to be set accordingly
        sparsity_config=SparsityConfig(num_heads=4, seq_len=1024)):
        """This function replaces the self attention layers in model encoder with sparse self attention.
        It currently supports bert and roberta model and can be easily extended to any other models following similar steps here.
        For sparsityConfig, refer to the config class.
        Arguments:
            model: required: a transformer model
            max_position: required: an integer determining new position embedding size
            sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class
        Return:
            model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer.
        """

        self.bert.config.max_position_embeddings = max_position
        self.replace_self_attention_layer_with_sparse_self_attention_layer(
            self.bert.config, self.bert.encoder.layer, sparsity_config)
    def __init__(
            self,
            # SparsityConfig parameters need to be set accordingly
            sparsity_config=SparsityConfig(num_heads=4),
            key_padding_mask_mode='add',
            attn_mask_mode='mul'):
        """Initialize the sparse self attention layer.
        Arguments:
            sparsity_config: optional: this parameter determines the sparsity pattern configuration; it is based on the SparsityConfig class.
            key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`.
            attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`.
        """
        super().__init__()

        # sparsity information
        self.sparsity_config = sparsity_config

        # mask modes
        self.key_padding_mask_mode = key_padding_mask_mode
        self.attn_mask_mode = attn_mask_mode
    def replace_model_self_attention_with_sparse_self_attention(
        self, max_position, sparsity_config=SparsityConfig(num_heads=4)):

        # Resize the position embedding size in the config, then delegate to the
        # per-layer replacement helper (see Example #3) exposed via the tbs module.
        self.config.max_position_embeddings = max_position
        tbs.replace_self_attention_layer_with_sparse_self_attention_layer(
            self.config, self.encoder.layer, sparsity_config)
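The __init__ in the middle of Example #4 initializes a sparse self attention layer; below is a minimal construction sketch, assuming the class is DeepSpeed's SparseSelfAttention and the import path shown (both assumptions, only the keyword arguments come from the snippet).

# Sketch: build the layer whose __init__ is shown above; the class name and
# import path are assumptions, the keyword arguments mirror the snippet.
from deepspeed.ops.sparse_attention import SparseSelfAttention, SparsityConfig

sparse_attn = SparseSelfAttention(
    sparsity_config=SparsityConfig(num_heads=16),
    key_padding_mask_mode='add',
    attn_mask_mode='mul')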