def update_to_sparse_transformer(self,
                                 max_position,
                                 sparsity_config=SparsityConfig(num_heads=4,
                                                                seq_len=1024)):
    self.extend_position_embedding(max_position)
    self.replace_model_self_attention_with_sparse_self_attention(max_position, sparsity_config)
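# Hypothetical call site: a minimal sketch assuming update_to_sparse_transformer has been
# attached to a Hugging Face style BERT model class as above. `MyPatchedBertModel` is a
# placeholder name for that patched class, and the SparsityConfig import path may differ
# across DeepSpeed versions.
from deepspeed.ops.sparse_attention import SparsityConfig

model = MyPatchedBertModel.from_pretrained('bert-base-uncased')  # hypothetical patched class
model.update_to_sparse_transformer(
    max_position=4096,
    sparsity_config=SparsityConfig(num_heads=16, seq_len=4096))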
def replace_model_self_attention_with_sparse_self_attention(
        model,
        max_position,
        # SparsityConfig parameters need to be set accordingly
        sparsity_config=SparsityConfig(num_heads=4,
                                       seq_len=1024)):
    """This function replaces the self attention layers in the model encoder with sparse self attention.

    It currently supports BERT and RoBERTa models and can easily be extended to other models
    by following similar steps.
    For sparsity_config, refer to the SparsityConfig class.

    Arguments:
        model: required: a transformer model
        max_position: required: an integer determining the new position embedding size
        sparsity_config: optional: this parameter determines the sparsity pattern configuration;
            it is based on the SparsityConfig class

    Return:
        model: updated model in which the self attention layers have been replaced with the
            DeepSpeed Sparse Self Attention layer.
    """
    if hasattr(model, 'bert'):
        model.config.max_position_embeddings = max_position
        replace_self_attention_layer_with_sparse_self_attention_layer(
            model.config,
            model.bert.encoder.layer,
            sparsity_config)
    elif hasattr(model, 'roberta'):
        model.config.max_position_embeddings = max_position + 2
        replace_self_attention_layer_with_sparse_self_attention_layer(
            model.config,
            model.roberta.encoder.layer,
            sparsity_config)
    else:
        raise ValueError(
            'Please extend "replace_model_self_attention_with_sparse_self_attention" to support '
            'your model type. It currently only supports "bert" & "roberta"!')
    return model
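# A minimal usage sketch, assuming a Hugging Face BERT checkpoint; the exact import paths
# may vary across transformers/DeepSpeed versions. Note that the position embedding matrix
# must also be extended to max_position, as done by update_to_sparse_transformer above.
from transformers import BertForSequenceClassification
from deepspeed.ops.sparse_attention import SparsityConfig

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model = replace_model_self_attention_with_sparse_self_attention(
    model,
    max_position=1024,
    sparsity_config=SparsityConfig(num_heads=model.config.num_attention_heads,
                                   seq_len=1024))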
def replace_self_attention_layer_with_sparse_self_attention_layer(
        config,
        layers,
        # SparsityConfig parameters need to be set accordingly
        sparsity_config=SparsityConfig(num_heads=4,
                                       seq_len=1024)):
    """This function replaces the self attention layers in the given attention layers with sparse self attention.

    For sparsity_config, refer to the SparsityConfig class.

    Arguments:
        config: required: transformer model config
        layers: required: transformer model attention layers
        sparsity_config: optional: this parameter determines the sparsity pattern configuration;
            it is based on the SparsityConfig class

    Return:
        layers: updated attention layers in which the self attention layers have been replaced
            with the DeepSpeed Sparse Self Attention layer.
    """
    for layer in layers:
        deepspeed_sparse_self_attn = BertSparseSelfAttention(config, sparsity_config)
        deepspeed_sparse_self_attn.query = layer.attention.self.query
        deepspeed_sparse_self_attn.key = layer.attention.self.key
        deepspeed_sparse_self_attn.value = layer.attention.self.value
        layer.attention.self = deepspeed_sparse_self_attn

    return layers
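# Hypothetical sanity check after replacement: a sketch assuming a Hugging Face BERT model
# and that BertSparseSelfAttention/SparsityConfig are importable from DeepSpeed's sparse
# attention ops. Because the existing query/key/value projections are reassigned onto the
# new module, the pretrained weights are preserved.
from transformers import BertForSequenceClassification
from deepspeed.ops.sparse_attention import BertSparseSelfAttention, SparsityConfig

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
layers = replace_self_attention_layer_with_sparse_self_attention_layer(
    model.config,
    model.bert.encoder.layer,
    SparsityConfig(num_heads=model.config.num_attention_heads, seq_len=512))
assert all(isinstance(layer.attention.self, BertSparseSelfAttention) for layer in layers)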
def replace_model_self_attention_with_sparse_self_attention(
        self,
        max_position,
        # SparsityConfig parameters need to be set accordingly
        sparsity_config=SparsityConfig(num_heads=4,
                                       seq_len=1024)):
    """This method replaces the self attention layers in this model's encoder with sparse self attention.

    It can easily be extended to other models by following similar steps.
    For sparsity_config, refer to the SparsityConfig class.

    Arguments:
        max_position: required: an integer determining the new position embedding size
        sparsity_config: optional: this parameter determines the sparsity pattern configuration;
            it is based on the SparsityConfig class

    Return:
        None; the model's self attention layers are replaced in place with the DeepSpeed
            Sparse Self Attention layer.
    """
    self.bert.config.max_position_embeddings = max_position
    self.replace_self_attention_layer_with_sparse_self_attention_layer(
        self.bert.config,
        self.bert.encoder.layer,
        sparsity_config)
def __init__(self,
             # SparsityConfig parameters need to be set accordingly
             sparsity_config=SparsityConfig(num_heads=4),
             key_padding_mask_mode='add',
             attn_mask_mode='mul'):
    """Initialize the sparse self attention layer.

    Arguments:
        sparsity_config: optional: this parameter determines the sparsity pattern configuration;
            it is based on the SparsityConfig class.
        key_padding_mask_mode: optional: a string determining whether the key padding mask is
            added (`add`) or multiplied (`mul`).
        attn_mask_mode: optional: a string determining whether the attention mask is
            added (`add`) or multiplied (`mul`).
    """
    super().__init__()

    # sparsity information
    self.sparsity_config = sparsity_config

    # mask modes
    self.key_padding_mask_mode = key_padding_mask_mode
    self.attn_mask_mode = attn_mask_mode
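# A minimal construction sketch, assuming SparseSelfAttention and FixedSparsityConfig
# (one of the SparsityConfig subclasses) are available from DeepSpeed's sparse attention
# ops; names and signatures may differ across versions.
from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig

sparse_attn = SparseSelfAttention(
    sparsity_config=FixedSparsityConfig(num_heads=16),
    key_padding_mask_mode='add',  # key padding mask is added to the attention scores
    attn_mask_mode='mul')         # attention mask is multiplied into the attention scores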
def replace_model_self_attention_with_sparse_self_attention(
        self,
        max_position,
        sparsity_config=SparsityConfig(num_heads=4)):
    self.config.max_position_embeddings = max_position
    tbs.replace_self_attention_layer_with_sparse_self_attention_layer(
        self.config,
        self.encoder.layer,
        sparsity_config)