def __init__(self, config):
    super().__init__()
    self.attention = BertAttention(config)
    self.is_decoder = config.is_decoder
    if self.is_decoder:
        self.crossattention = BertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
def __init__(self, config):
    super().__init__()
    self.chunk_size_feed_forward = config.chunk_size_feed_forward
    self.seq_len_dim = 1
    self.attention = BertAttention(config)
    self.is_decoder = config.is_decoder
    self.add_cross_attention = config.add_cross_attention
    if self.add_cross_attention:
        assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
        self.crossattention = BertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
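A minimal construction sketch (an assumption, not part of the original source): the __init__ above matches huggingface's BertLayer, so a decoder layer with cross-attention could be built with the standard BertConfig flags.

from transformers import BertConfig

# BertLayer is the assumed name of the enclosing class; the snippet above
# only shows its __init__.
config = BertConfig(is_decoder=True, add_cross_attention=True)
layer = BertLayer(config)
assert hasattr(layer, "crossattention")  # created only when add_cross_attention is set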
def __init__(self, config):
    super().__init__()
    # The cross-attention layer
    self.visual_attention = BertCrossattLayer(config)
    # Self-attention layers
    self.lang_self_att = BertAttention(config)
    self.visn_self_att = BertAttention(config)
    # Intermediate and output layers (FFNs)
    self.lang_inter = BertIntermediate(config)
    self.lang_output = BertOutput(config)
    self.visn_inter = BertIntermediate(config)
    self.visn_output = BertOutput(config)
def from_torch(attention: TorchBertAttention):
    params = {k: v for k, v in attention.named_parameters()}
    with torch.no_grad():
        # Merge self.query.weight, self.key.weight and self.value.weight
        # together as qkv.weight.
        qkv_weight = torch.clone(
            torch.t(
                torch.cat((params['self.query.weight'],
                           params['self.key.weight'],
                           params['self.value.weight']),
                          0).contiguous()).contiguous())
        qkv_bias = torch.cat(
            (params['self.query.bias'], params['self.key.bias'],
             params['self.value.bias']), 0).contiguous()

        output_weight = torch.clone(
            torch.t(params['output.dense.weight']).contiguous())
        att = BertAttention(
            convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
            convert2tt_tensor(output_weight),
            convert2tt_tensor(params['output.dense.bias']),
            convert2tt_tensor(params['output.LayerNorm.weight']),
            convert2tt_tensor(params['output.LayerNorm.bias']),
            attention.self.num_attention_heads)
        return att
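A hedged usage sketch for the converter above (not from the original source): it assumes from_torch is exposed as a static method on turbo_transformers' BertAttention wrapper, which the test code later in this section suggests.

import turbo_transformers
from transformers import BertConfig
# Import path for recent transformers releases; older ones used transformers.modeling_bert.
from transformers.models.bert.modeling_bert import BertAttention as TorchBertAttention

cfg = BertConfig(attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0)
torch_attention = TorchBertAttention(cfg).eval()
# The converter fuses the Q/K/V projections into a single qkv weight for the turbo kernels.
turbo_attention = turbo_transformers.BertAttention.from_torch(torch_attention)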
def __init__(self, config):
    super(GeneratingMasksAC, self).__init__(config)
    if config.model_type == 'bert':
        self.bert = BertModel(config=config)
    else:
        self.bert = None

    # Reload the config (since this is BERT, there is probably a simpler
    # way to modify it).
    if self.bert is not None:
        config = BertConfig.from_pretrained("bert-base-uncased")
        config.attention_probs_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0
        self.config = config

    self.transformer = BertAttention(config)
    self.policy1 = nn.Linear(config.hidden_size, 128)
    self.policy2 = nn.Linear(128, 1)

    # Value part #
    self.value1 = nn.Linear(config.hidden_size, 128)
    self.value2 = nn.Linear(128, 1)

    # self.apply(self._init_weights)
    self.init_weights()
def init_data(self, use_cuda):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)
    self.cfg.output_attentions = True
    torch_attention = BertAttention(self.cfg)
    torch_attention.eval()
    if use_cuda:
        torch_attention.to(test_device)

    # Get FT Attention
    turbo_attention = turbo_transformers.BertAttention.from_torch(
        torch_attention)
    turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
        torch_attention, is_trans_weight=False)

    hidden_size = self.cfg.hidden_size
    input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                              dtype=torch.float32,
                              device=test_device)
    attention_mask = torch.ones((batch_size, seq_length),
                                dtype=torch.float32,
                                device=test_device)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0

    return torch_attention, turbo_attention, turbo_decoder_attention, \
        input_tensor, attention_mask
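A sketch of how init_data's return values might feed a parity check (an assumption that mirrors the assertions used elsewhere in this section; the turbo module is assumed to accept the same (input, attention_mask) call as the torch one and to return a tuple).

def check_attention_parity(self, use_cuda=False):
    torch_attn, turbo_attn, _, inputs, mask = self.init_data(use_cuda)
    torch_out = torch_attn(inputs, attention_mask=mask)[0]
    turbo_out = turbo_attn(inputs, attention_mask=mask)[0]
    # Looser tolerance on GPU, matching the other checks in this section.
    tolerate_error = 1e-2 if use_cuda else 1e-3
    assert torch.max(torch.abs(torch_out - turbo_out)) < tolerate_error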
def __init__(self, config):
    super(BertLayerOracleSparse, self).__init__()
    logger.info(
        f"Set Oracle Sparse with key_c:{config.key_c} and query_c:{config.query_c}!"
    )
    self.attention = BertAttention(config)
    self.attention.self.output_attentions = True
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
    self.key_c = config.key_c
    self.query_c = config.query_c
    self.num_heads = config.num_attention_heads
def init_attn_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)
    # torch model is from ONMT
    # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads, self.cfg.hidden_size)
    self.torch_model = BertAttention(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)
    self.hidden_size = self.cfg.hidden_size
    # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
    #     self.torch_model)
    self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
        self.torch_model)
def init_bert_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)
    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)
    self.hidden_size = self.cfg.hidden_size
    self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
        self.torch_model)
class TestBertSmartBatch(unittest.TestCase):
    def init_bertlayer_models(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                              hidden_dropout_prob=0.0)
        self.torch_model = BertLayer(self.cfg)
        self.torch_model.eval()
        if use_cuda:
            self.torch_model.to(self.test_device)
        self.hidden_size = self.cfg.hidden_size
        self.turbo_model = turbo_transformers.BertLayerSmartBatch.from_torch(
            self.torch_model)

    def init_bert_models(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                              hidden_dropout_prob=0.0)
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if use_cuda:
            self.torch_model.to(self.test_device)
        self.hidden_size = self.cfg.hidden_size
        self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
            self.torch_model)

    def init_attn_models(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(1)

        torch.set_grad_enabled(False)
        self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                              hidden_dropout_prob=0.0)
        # torch model is from ONMT
        # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads, self.cfg.hidden_size)
        self.torch_model = BertAttention(self.cfg)
        self.torch_model.eval()
        if use_cuda:
            self.torch_model.to(self.test_device)
        self.hidden_size = self.cfg.hidden_size
        # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
        #     self.torch_model)
        self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
            self.torch_model)

    def init_inputs(self):
        # prepare torch input data
        self.input_list = []
        for query_seq_len in query_seq_len_list:
            Q = torch.rand(
                size=(
                    1,
                    query_seq_len,  # from_seq
                    self.hidden_size),
                dtype=torch.float32,
                device=self.test_device)
            self.input_list.append(Q)

        # concat Qs together
        for i in range(len(query_seq_len_list)):
            if i == 0:
                self.concat_Q = self.input_list[i]
            else:
                self.concat_Q = torch.cat(
                    (self.concat_Q, self.input_list[i]), 1)
        self.assertTrue(self.concat_Q.size()[1] == sum(query_seq_len_list))

    def init_inputs_seq(self):
        # prepare torch input data
        self.input_list = []
        for query_seq_len in query_seq_len_list:
            input_seq = torch.randint(low=0,
                                      high=self.cfg.vocab_size - 1,
                                      size=(1, query_seq_len),
                                      dtype=torch.long,
                                      device=self.test_device)
            self.input_list.append(input_seq)
        # self.assertTrue(self.concat_Q.size()[1] == sum(query_seq_len_list))

    def check_bert_attn(self, use_cuda):
        self.init_attn_models(use_cuda)
        self.init_inputs()
        num_iter = 2
        device = "GPU" if use_cuda else "CPU"

        res_list = []
        for Q in self.input_list:
            # res, _ = self.torch_model(
            #     Q,
            #     Q,
            #     Q,
            #     mask=None,
            #     layer_cache=None,  # layer_cache_torch
            #     attn_type="self")
            # res_list.append(res)
            attention_mask = torch.ones((1, Q.size(1)),
                                        dtype=torch.float32,
                                        device=self.test_device)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * -10000.0
            res = self.torch_model(Q, attention_mask=None)
            res_list.append(res[0])

        # concat res_list together
        for i in range(len(res_list)):
            if i == 0:
                concat_res = res_list[i]
            else:
                concat_res = torch.cat((concat_res, res_list[i]), 1)

        pad_result, _ = self.turbo_model(self.concat_Q,
                                         self.concat_Q,
                                         self.concat_Q,
                                         query_seq_len_list, [],
                                         mask=None,
                                         layer_cache=None,
                                         post_layernorm=True,
                                         attn_type="self")

        # Tensor cores introduce more error.
        tolerate_error = 1e-2 if use_cuda else 1e-3
        self.assertTrue(
            torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)

    def check_bert_layer(self, use_cuda):
        self.init_bertlayer_models(use_cuda)
        self.init_inputs()
        num_iter = 2
        device = "GPU" if use_cuda else "CPU"

        res_list = []
        for Q in self.input_list:
            res, _ = self.torch_model(Q, None, output_attentions=True)
            res_list.append(res)

        # concat res_list together
        for i in range(len(res_list)):
            if i == 0:
                concat_res = res_list[i]
            else:
                concat_res = torch.cat((concat_res, res_list[i]), 1)

        pad_result, _ = self.turbo_model(self.concat_Q,
                                         query_seq_len_list,
                                         attention_mask=None,
                                         output_attentions=False)

        # Tensor cores introduce more error.
        tolerate_error = 1e-2 if use_cuda else 1e-3
        self.assertTrue(
            torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)
        # self.assertTrue(
        #     torch.max(
        #         torch.abs(torch_bert_layer_result[1] -
        #                   turbo_bert_layer_result[1])) < tolerate_error)
        # with open(fname, "a") as fh:
        #     fh.write(
        #         f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
        #     )

    def check_bert_model(self, use_cuda):
        self.init_bert_models(use_cuda)
        self.init_inputs_seq()
        num_iter = 2
        device = "GPU" if use_cuda else "CPU"

        # for reference
        res_list = []
        for Q in self.input_list:
            res, _ = self.torch_model(Q)
            res_list.append(res)

        for i in range(len(res_list)):
            if i == 0:
                concat_res = res_list[i]
            else:
                concat_res = torch.cat((concat_res, res_list[i]), 1)

        # turbo inference
        pad_result, _ = self.turbo_model(self.input_list, query_seq_len_list)

        # Tensor cores introduce more error.
        tolerate_error = 1e-2 if use_cuda else 1e-3
        self.assertTrue(
            torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)
        # self.assertTrue(
        #     torch.max(
        #         torch.abs(torch_bert_layer_result[1] -
        #                   turbo_bert_layer_result[1])) < tolerate_error)
        # with open(fname, "a") as fh:
        #     fh.write(
        #         f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
        #     )

    def test_bert(self):
        self.check_bert_model(use_cuda=False)
        self.check_bert_layer(use_cuda=False)
        self.check_bert_attn(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_bert_model(use_cuda=True)
            self.check_bert_layer(use_cuda=True)
            self.check_bert_attn(use_cuda=True)
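The test class references a module-level fixture, query_seq_len_list, that is defined elsewhere in the original file; a minimal sketch with illustrative values, plus the standard unittest entry point, would make it runnable on its own.

# Illustrative fixture values (assumptions; the original file defines its own).
query_seq_len_list = [7, 13, 20]

if __name__ == "__main__":
    unittest.main()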
def __init__(self, config):
    super(BertScanLayer, self).__init__()
    self.attention = BertAttention(config)
    self.scan_attention = BertScanAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output = BertOutput(config)
def from_torch(attention: TorchBertAttention,
               layer_norm: Optional[TorchLayerNorm] = None,
               is_trans_weight: bool = False):
    """Load an attention model from a huggingface bert attention module."""
    ln_params = {}
    if layer_norm is not None:
        ln_params = {k: v for k, v in layer_norm.named_parameters()}
    params = {k: v for k, v in attention.named_parameters()}
    with torch.no_grad():
        if is_trans_weight:
            # Merge self.query.weight, self.key.weight and self.value.weight
            # together as qkv.weight.
            qkv_weight = torch.cat(
                (params['self.query.weight'], params['self.key.weight'],
                 params['self.value.weight']), 0)
            output_weight = params['output.dense.weight']
            k_w = params['self.key.weight']
            v_w = params['self.value.weight']
            q_w = params['self.query.weight']
        else:
            # Merge self.query.weight, self.key.weight and self.value.weight
            # together as qkv.weight, transposed for the turbo kernels.
            qkv_weight = torch.clone(
                torch.t(
                    torch.cat((params['self.query.weight'],
                               params['self.key.weight'],
                               params['self.value.weight']),
                              0).contiguous()).contiguous())
            output_weight = torch.clone(
                torch.t(params['output.dense.weight']).contiguous())
            k_w = torch.clone(
                torch.t(params['self.key.weight']).contiguous())
            v_w = torch.clone(
                torch.t(params['self.value.weight']).contiguous())
            q_w = torch.clone(
                torch.t(params['self.query.weight']).contiguous())

        qkv_bias = torch.cat(
            (params['self.query.bias'], params['self.key.bias'],
             params['self.value.bias']), 0)

        if layer_norm is not None:
            att = MultiHeadedAttention(
                convert2tt_tensor(k_w),
                convert2tt_tensor(params['self.key.bias']),
                convert2tt_tensor(v_w),
                convert2tt_tensor(params['self.value.bias']),
                convert2tt_tensor(q_w),
                convert2tt_tensor(params['self.query.bias']),
                convert2tt_tensor(output_weight),
                convert2tt_tensor(params['output.dense.bias']),
                convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
                convert2tt_tensor(params['output.LayerNorm.weight']),
                convert2tt_tensor(params['output.LayerNorm.bias']),
                convert2tt_tensor(ln_params['weight']),
                convert2tt_tensor(ln_params['bias']),
                attention.self.num_attention_heads)
        else:
            att = MultiHeadedAttention(
                convert2tt_tensor(k_w),
                convert2tt_tensor(params['self.key.bias']),
                convert2tt_tensor(v_w),
                convert2tt_tensor(params['self.value.bias']),
                convert2tt_tensor(q_w),
                convert2tt_tensor(params['self.query.bias']),
                convert2tt_tensor(output_weight),
                convert2tt_tensor(params['output.dense.bias']),
                convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
                convert2tt_tensor(params['output.LayerNorm.weight']),
                convert2tt_tensor(params['output.LayerNorm.bias']),
                attention.self.num_attention_heads)
        return att
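As with the BertAttention converter earlier, a hedged usage sketch (assumed, not from the source): from_torch is taken to be a static method on turbo_transformers' MultiHeadedAttention, with the optional layer_norm argument fusing an extra post-attention LayerNorm.

import torch
import turbo_transformers
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertAttention as TorchBertAttention

cfg = BertConfig(attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0)
torch_attention = TorchBertAttention(cfg).eval()

# Weights pre-transposed for the turbo kernels (is_trans_weight defaults to False).
mha = turbo_transformers.MultiHeadedAttention.from_torch(torch_attention)

# Variant that also fuses a trailing LayerNorm (ln is a hypothetical extra layer).
ln = torch.nn.LayerNorm(cfg.hidden_size).eval()
mha_ln = turbo_transformers.MultiHeadedAttention.from_torch(torch_attention, ln)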