def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda) num_iter = 1 device_name = "GPU" if use_cuda else "CPU" input_ids = torch.randint(low=0, high=self.cfg.vocab_size - 1, size=(1, 10), dtype=torch.long, device=self.test_device) torch_model = lambda: self.torch_model(input_ids) torch_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f'GPT2Model PyTorch({device_name}) QPS {torch_qps}') turbo_model = (lambda: self.turbo_model(input_ids)) turbo_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print(f'GPT2Model TurboTransformer({device_name}) QPS {turbo_qps}') self.assertTrue( numpy.allclose(torch_result[0].cpu(), turbo_result[0].cpu(), atol=1e-3, rtol=1e-3))
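# For context: `test_helper.run_model` is the shared timing harness every
# check below relies on. A minimal sketch of its assumed contract (an
# illustration, not the repo's actual implementation): call the zero-argument
# `model` `num_iter` times, synchronize CUDA around the timed region, and
# return (last_result, queries_per_second, elapsed_seconds). `torch` is
# assumed imported at module top, as elsewhere in these files.
import time


def run_model_sketch(model, use_cuda, num_iter):
    if use_cuda:
        torch.cuda.synchronize()
    start = time.time()
    for _ in range(num_iter):
        result = model()
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
    return result, num_iter / elapsed, elapsed
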
def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda=use_cuda) device = "GPU" if use_cuda else "CPU" num_iter = 1 turbo_model = lambda: self.turbo_model( self.input_tensor, attention_mask=None, head_mask=None) turbo_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"AlbertLayer \"({batch_size},{seq_length:03})\" ", f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}" ) torch_model = lambda: self.torch_model(input_ids=self.input_tensor, attention_mask=None, head_mask=None) with turbo_transformers.pref_guard("albert_perf") as perf: torch_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f"AlbertModel \"({batch_size},{seq_length:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time}") # print(turbo_result[-1]) # print(turbo_result, torch_result[0]) # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences? tolerate_error = 1e-2 self.assertTrue( torch.max(torch.abs(torch_result[0] - turbo_result[0])) < tolerate_error) with open("albert_model_res.txt", "a") as fh: fh.write( f"\"({batch_size},{seq_length:03})\", {torch_qps}, {torch_qps}\n" )
def check_torch_and_turbo(self, use_cuda):
    self.init_data(use_cuda)
    num_iter = 20
    device_name = "GPU" if use_cuda else "CPU"
    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=self.test_device)

    torch_model = lambda: self.torch_model(input_ids)
    torch_result, torch_qps, torch_time = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f'RobertaModel PyTorch({device_name}) QPS {torch_qps}')

    turbo_model = lambda: self.turbo_model(input_ids)
    with turbo_transformers.pref_guard("roberta_perf") as perf:
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f'RobertaModel TurboTransformer({device_name}) QPS {turbo_qps}')

    torch_result_final = torch_result[0].cpu().numpy()
    turbo_result_final = turbo_result[0].cpu().numpy()
    self.assertTrue(
        numpy.allclose(torch_result_final,
                       turbo_result_final,
                       atol=1e-3,
                       rtol=1e-3))

def check_torch_and_turbo(self, use_cuda, num_iter=1): self.init_data(use_cuda) device = "GPU" if use_cuda else "CPU" torch_model = lambda: self.torch_sa_layer_norm( self.torch_attention(query=self.input_tensor, key=self.input_tensor, value=self.input_tensor, mask=self.attention_mask, output_attentions=False)[0] + self. input_tensor) torch_attention_result, torch_qps, torch_time_consume = \ test_helper.run_model(torch_model, use_cuda, num_iter, use_profile=False) print( f"DistilAttention+LN \"({batch_size},{seq_length:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") turbo_model = lambda: self.turbo_attention( self.input_tensor, self.attention_mask, output_attentions=self.cfg.output_attentions)[0] turbo_attention_result, turbo_qps, turbo_time_consume = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"DistilAttention \"({batch_size},{seq_length:03})\" ", f" {device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}" ) self.assertTrue( torch.max( torch.abs(torch_attention_result - turbo_attention_result)) < (1e-3 if use_cuda else 1e-4))
def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda) num_iter = 2 device = "GPU" if use_cuda else "CPU" torch_model = lambda: self.torch_bert_layer( self.input_tensor, self.attention_mask) torch_bert_layer_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f"BertLayer \"({batch_size},{seq_length:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time}") turbo_model = lambda: self.turbo_bert_layer( self.input_tensor, self.attention_mask) turbo_bert_layer_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"BertLayer \"({batch_size},{seq_length:03})\" ", f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}" ) # Tensor core will introduce more errors tolerate_error = 1e-2 if use_cuda else 1e-3 self.assertTrue( torch.max( torch.abs(torch_bert_layer_result[0] - turbo_bert_layer_result)) < tolerate_error) with open(fname, "a") as fh: fh.write( f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n" )
def check_torch_and_turbo(self, use_cuda):
    self.init_data(use_cuda)
    sio = io.StringIO()
    num_iter = 2
    device = "GPU" if use_cuda else "CPU"

    torch_model = lambda: self.torch_bertout(self.intermediate_output,
                                             self.attention_output)
    torch_result, torch_qps, torch_time = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f'Bert Output Plain PyTorch({device}) QPS {torch_qps}', file=sio)

    turbo_model = lambda: self.turbo_bertout(self.intermediate_output,
                                             self.attention_output)
    turbo_result, turbo_qps, turbo_time = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f'Bert Output Plain TurboTransformer({device}) QPS {turbo_qps}',
          file=sio)

    # The CUDA tolerance is looser because tensor cores lower precision.
    # The tolerance must be parenthesized: without parentheses,
    # `a < x if cond else y` parses as `(a < x) if cond else y`, so the CPU
    # branch degenerated to `assertTrue(1e-4)`, which always passes.
    self.assertTrue(
        torch.max(torch.abs(torch_result - turbo_result)) <
        (1e-2 if use_cuda else 1e-4))

    sio.seek(0)
    with open(f"gpu_bert_output_qps_{batch_size}_{seq_length:03}.txt",
              "w") as of:
        for line in sio:
            print(line.strip(), file=of)

def check_torch_and_turbo(self, use_cuda=True):
    self.init_data(use_cuda=use_cuda)
    self.num_iter = 2

    # Benchmark with Turbo's native return type; `output` is left as None so
    # Turbo allocates the result tensor itself.
    turbo_bert_layer_result = None
    turbo_model = lambda: self.turbo_bert_encoder(
        self.input_tensor,
        self.attention_mask,
        output=turbo_bert_layer_result,
        return_type=turbo_transformers.ReturnType.turbo_transformers)
    turbo_bert_layer_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model, use_cuda, self.num_iter)
    print(f"BertEncoder TurboTransformer QPS, {turbo_qps}, ",
          f"Time Cost, {turbo_time_consume}")

    # Run once more with the default return type to get a torch.Tensor result
    # for the comparison below.
    turbo_bert_layer_result = self.turbo_bert_encoder(
        self.input_tensor, self.attention_mask)

    torch_model = lambda: self.torch_encoder_layer(
        self.input_tensor, self.attention_mask,
        [None] * self.cfg.num_hidden_layers)
    torch_bert_layer_result, torch_qps, torch_time_consume = \
        test_helper.run_model(torch_model, use_cuda, self.num_iter)
    print(f"BertEncoder Torch QPS, {torch_qps}, ",
          f"Time Cost, {torch_time_consume}")

    diff = torch.abs(torch_bert_layer_result[0] - turbo_bert_layer_result)
    self.assertTrue(torch.max(diff) < 1e-3)

def check_torch_and_turbo(self, use_cuda, num_iter=2):
    torch_attention, turbo_attention, input_tensor, attention_mask = \
        self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"

    torch_model = lambda: torch_attention(input_tensor, attention_mask)
    torch_attention_result, torch_qps, torch_time_consume = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f"AlbertAttention \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

    turbo_model = lambda: turbo_attention(input_tensor, attention_mask)
    turbo_self_attention_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f"AlbertAttention \"({batch_size},{seq_length:03})\" ",
          f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

    self.assertTrue(
        torch.max(
            torch.abs(torch_attention_result[0] -
                      turbo_self_attention_result)) <
        (1e-3 if use_cuda else 1e-4))
    with open(fname, "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n")

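# The device-dependent max-abs-error check recurs throughout these tests (with
# the parenthesization fixed above). A hypothetical helper, not in the
# original files, that captures the pattern in one place:
def assert_max_abs_close(test_case, ref, out, use_cuda,
                         gpu_tol=1e-3, cpu_tol=1e-4):
    # GPU runs use a looser tolerance because tensor cores lower precision.
    diff = torch.max(torch.abs(ref - out))
    test_case.assertTrue(diff < (gpu_tol if use_cuda else cpu_tol))
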
def check_torch_and_turbo(self, use_cuda, num_iter=1): self.init_data(use_cuda) device = "GPU" if use_cuda else "CPU" torch_model = lambda: self.output_layer_norm( self.torch_ffn(self.inputs) + self.inputs) torch_res, torch_qps, torch_time_consume = \ test_helper.run_model(torch_model, use_cuda, num_iter) print( f"DistrillFFN \"({batch_size}, {input_len:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") turbo_res = lambda: self.turbo_ffn(self.inputs, is_trans_weight=True) with turbo_transformers.pref_guard("gpref_test") as perf: turbo_res, turbo_qps, turbo_time_consume = \ test_helper.run_model(turbo_res, use_cuda, num_iter) print( f"DistrillFFN \"({batch_size}, {input_len:03})\" ", f"{device} Turbo Trans QPS, {turbo_qps}, time, {turbo_time_consume}" ) print(torch.max(torch.abs(torch_res - turbo_res))) self.assertTrue(torch.max(torch.abs(torch_res - turbo_res)) < 1e-3) with open(fname, "a") as fh: fh.write( f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" )
def check_torch_and_turbo(self, use_cuda, num_iter=1): deivce_type = "GPU" if use_cuda else "CPU" info = f"\"({deivce_type}, {batch_size}, {src_length}, {T})\"" self.init_data(use_cuda=use_cuda) self.inputs = torch.rand( batch_size, src_length, self.model_dim, dtype=torch.float32, device=self.test_device) self.mask = torch.randint(-100, 0, (batch_size, 1, src_length), dtype=torch.int64, device=self.test_device) onmt_mask = self.mask > 0 onmt_model = lambda: self.onmt_encoder(self.inputs, onmt_mask) onmt_result, torch_qps, torch_time_consume = \ test_helper.run_model(onmt_model, use_cuda, num_iter) print( f"ONMT Encoder {info} ", f"{deivce_type} QPS, {torch_qps}, time, {torch_time_consume}") if with_quantize_dynamic and not use_cuda: quantized_onmt_model = lambda: self.quantized_onmt_encoder( self.inputs,onmt_mask) quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \ test_helper.run_model(quantized_onmt_model, use_cuda, num_iter) print( f"ONMT Quantized Encoder {info} ", f"{deivce_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}" ) turbo_model = lambda: self.turbo_encoder(self.inputs, onmt_mask) with turbo_transformers.pref_guard(info) as perf: turbo_result, turbo_qps, turbo_time_consume = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"Turbo Encoder {info} ", f"{deivce_type} QPS, {turbo_qps}, time, {turbo_time_consume}") print(f"diff max {torch.max(torch.abs(onmt_result - turbo_result))}") self.assertTrue( torch.max(torch.abs(onmt_result - turbo_result)) < (1e-3 if use_cuda else 1e-4)) if with_quantize_dynamic and not use_cuda: with open(fname, "a") as fh: fh.write( f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n" ) else: with open(fname, "a") as fh: fh.write(f"{info} {torch_qps}, {turbo_qps}\n")
def check_torch_and_turbo(self, use_cuda, num_iter=1): self.init_data(use_cuda) device = "GPU" if use_cuda else "CPU" torch_model = lambda: self.torch_model(self.inputs, self. attention_mask) torch_res, torch_qps, torch_time_consume = \ test_helper.run_model(torch_model, use_cuda, num_iter) print( f"DistillBertModel \"({batch_size}, {input_len:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}") turbo_res = lambda: self.turbo_transformer( self.inputs, self.attention_mask, head_mask=self.head_mask) with turbo_transformers.pref_guard("gpref_test") as perf: turbo_res, turbo_qps, turbo_time_consume = \ test_helper.run_model(turbo_res, use_cuda, num_iter) print( f"DistillBertModel \"({batch_size}, {input_len:03})\" ", f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}") self.assertTrue( torch.max(torch.abs(torch_res[0] - turbo_res[0])) < 1e-2 if use_cuda else 1e-3) with open(fname, "a") as fh: fh.write( f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n" )
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    torch_attention, turbo_attention, turbo_decoder_attention, input_tensor, attention_mask = \
        self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"

    torch_model = lambda: torch_attention(input_tensor, attention_mask)
    torch_attention_result, torch_qps, torch_time_consume = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f"BertAttention \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

    turbo_model = lambda: turbo_attention(input_tensor, attention_mask)
    turbo_attention_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f"BertAttention \"({batch_size},{seq_length:03})\" ",
          f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

    self.assertTrue(
        torch.max(
            torch.abs(torch_attention_result[0] -
                      turbo_attention_result[0])) <
        (1e-3 if use_cuda else 1e-4))

    # The same self-attention expressed through the decoder-style
    # multi-headed attention interface should match the reference as well.
    turbo_multiheaded_model = lambda: turbo_decoder_attention(
        input_tensor,
        input_tensor,
        input_tensor,
        attention_mask,
        layer_cache=None,
        attn_type="self",
        pre_layernorm=False,
        post_layernorm=True,
        post_add_input=False,
        is_trans_weight=False)
    turbo_decoder_attn_result, turbo_decoder_qps, turbo_decoder_time_consume = \
        test_helper.run_model(turbo_multiheaded_model, use_cuda, num_iter,
                              use_profile=False)
    print(
        f"MultiHeadedAttention \"({batch_size},{seq_length:03})\" ",
        f"{device} Turbo QPS, {turbo_decoder_qps}, time, {turbo_decoder_time_consume}"
    )
    self.assertTrue(
        torch.max(
            torch.abs(torch_attention_result[0] -
                      turbo_decoder_attn_result[0])) <
        (1e-3 if use_cuda else 1e-4))

    with open(fname, "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n")

def check_torch_and_turbo(self, use_cuda=True):
    self.init_data(use_cuda=use_cuda)
    self.num_iter = 2

    turbo_model = lambda: self.turbo_bert_encoder(
        self.input_tensor,
        self.attention_mask,
        output_attentions=True,
        output_hidden_states=True)
    turbo_bert_layer_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model, use_cuda, self.num_iter)
    print(f"BertEncoder TurboTransformer QPS, {turbo_qps}, ",
          f"Time Cost, {turbo_time_consume}")

    torch_model = lambda: self.torch_encoder_layer(
        self.input_tensor,
        self.attention_mask, [None] * self.cfg.num_hidden_layers,
        output_attentions=True,
        output_hidden_states=True)
    torch_bert_layer_result, torch_qps, torch_time_consume = \
        test_helper.run_model(torch_model, use_cuda, self.num_iter)
    print(f"BertEncoder Torch QPS, {torch_qps}, ",
          f"Time Cost, {torch_time_consume}")

    # Compare the final output.
    diff = torch.abs(torch_bert_layer_result[0] - turbo_bert_layer_result[0])
    self.assertTrue(torch.max(diff) < 1e-2)

    # Compare all hidden states; the last one duplicates the output above.
    for a, b in zip(torch_bert_layer_result[1], turbo_bert_layer_result[1]):
        diff = torch.abs(a - b)
        self.assertTrue(torch.max(diff) < 1e-2)
    # Compare the per-layer attention probabilities.
    for a, b in zip(torch_bert_layer_result[2], turbo_bert_layer_result[2]):
        diff = torch.abs(a - b)
        self.assertTrue(torch.max(diff) < 1e-2)

def check_torch_and_turbo(self, batch_size, seq_length, use_cuda,
                          use_memory_opt):
    self.init_data(use_cuda=use_cuda)
    self.input_tensor = torch.randint(low=0,
                                      high=self.cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      device=self.test_device)
    device = "GPU" if use_cuda else "CPU"
    num_iter = 1

    if use_memory_opt:
        # Pre-allocate activation memory for this (batch, seq_len) shape.
        turbo_transformers.bert_opt_mem_allocate_api(
            self.input_tensor.size()[0],  # batch
            self.input_tensor.size()[1],  # seq_len
            self.cfg.num_attention_heads,
            self.cfg.hidden_size,
            self.cfg.num_hidden_layers,
            "GPU" if 'cuda' in self.input_tensor.device.type else "CPU")

    turbo_model = lambda: self.turbo_model(
        self.input_tensor, attention_mask=None, head_mask=None)
    turbo_result, turbo_qps, turbo_time = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
          f"{device} TurboTransformer QPS, {turbo_qps}, time, {turbo_time}")

    torch_model = lambda: self.torch_model(
        input_ids=self.input_tensor, attention_mask=None, head_mask=None)
    with turbo_transformers.pref_guard("albert_perf") as perf:
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

    # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
    tolerate_error = 1e-2
    self.assertTrue(
        torch.max(torch.abs(torch_result[0] - turbo_result[0])) <
        tolerate_error)

    with open("albert_model_res.txt", "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n")

def check_torch_and_turbo(self, use_cuda, use_pooler):
    self.init_data(use_cuda)
    num_iter = 2
    device_name = "GPU" if use_cuda else "CPU"
    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(2, 32),
                              dtype=torch.long,
                              device=self.test_device)

    torch_model = lambda: self.torch_model(input_ids)
    torch_result, torch_qps, torch_time = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

    turbo_model = (
        lambda: self.turbo_pooler_model(input_ids)) if use_pooler else (
            lambda: self.turbo_model(input_ids))
    turbo_result, turbo_qps, turbo_time = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

    # With the pooler, compare pooled outputs; otherwise compare the first
    # token of the last hidden state.
    torch_result_final = (torch_result[1]).cpu().numpy(
    ) if use_pooler else torch_result[0][:, 0].cpu().numpy()
    turbo_result_final = turbo_result[0].cpu().numpy(
    ) if use_pooler else turbo_result.cpu().numpy()

    # TODO(jiaruifang, v_cshi) check why the pooler introduces a larger difference.
    if use_pooler:
        print(
            "encode output diff: ",
            numpy.max((torch_result[0][:, 0]).cpu().numpy() -
                      turbo_result[1].cpu().numpy()).reshape(-1))
        print(
            "pooler output diff: ",
            numpy.max((turbo_result_final - torch_result_final).reshape(-1)))
    (atol, rtol) = (1e-2, 1e-2) if use_pooler else (5e-3, 1e-4)
    self.assertTrue(
        numpy.allclose(torch_result_final,
                       turbo_result_final,
                       atol=atol,
                       rtol=rtol))

def check_torch_and_turbo(self,
                          use_cuda,
                          batch_size,
                          seq_len,
                          use_memory_opt=True):
    self.init_data(use_cuda)
    num_iter = 1
    device_name = "GPU" if use_cuda else "CPU"
    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=self.test_device)

    torch_model = lambda: self.torch_model(input_ids)
    torch_result, torch_qps, torch_time = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

    turbo_model = lambda: self.turbo_model(input_ids)
    if use_memory_opt:
        # Pre-allocate activation memory for this (batch, seq_len) shape.
        turbo_transformers.bert_opt_mem_allocate_api(
            input_ids.size()[0],  # batch
            input_ids.size()[1],  # seq_len
            self.cfg.num_attention_heads,
            self.cfg.hidden_size,
            self.cfg.num_hidden_layers,
            "GPU" if 'cuda' in input_ids.device.type else "CPU")
    with turbo_transformers.pref_guard("bert_perf") as perf:
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

    print(f"batch {batch_size} seq_len {seq_len}")
    print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
    self.assertTrue(
        numpy.allclose(torch_result[0].cpu(),
                       turbo_result[0].cpu(),
                       atol=1e-2,
                       rtol=1e-3))

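# Because the memory-optimizer path above needs explicit shapes, its driver
# sweeps them. A hypothetical sketch (shape values and the CUDA guard are
# assumptions for illustration):
def test_bert_model(self):
    for batch_size in [1, 2]:
        for seq_len in [10, 50]:
            self.check_torch_and_turbo(use_cuda=False,
                                       batch_size=batch_size,
                                       seq_len=seq_len,
                                       use_memory_opt=False)
            if torch.cuda.is_available() and \
                    turbo_transformers.config.is_compiled_with_cuda():
                self.check_torch_and_turbo(use_cuda=True,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=True)
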
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs = self.init_data(
        use_cuda)
    device = "GPU" if use_cuda else "CPU"

    onmt_model = lambda: onmt_ffn(inputs)
    onmt_model_result, torch_qps, torch_time_consume = \
        test_helper.run_model(onmt_model, use_cuda, num_iter)
    print(f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
          f"{device} ONMT QPS, {torch_qps}, time, {torch_time_consume}")

    turbo_model_trans = lambda: turbo_ffn_trans(inputs, is_trans_weight=True)
    with turbo_transformers.pref_guard("gpref_test") as perf:
        turbo_model_result, turbo_qps_trans, turbo_time_consume_trans = \
            test_helper.run_model(turbo_model_trans, use_cuda, num_iter)
    print(
        f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
        f"{device} Turbo Trans QPS, {turbo_qps_trans}, time, {turbo_time_consume_trans}"
    )

    turbo_model_notrans = lambda: turbo_ffn_notrans(inputs,
                                                    is_trans_weight=False)
    with turbo_transformers.pref_guard("gpref_test") as perf:
        turbo_model_result, turbo_qps_notrans, turbo_time_consume_notrans = \
            test_helper.run_model(turbo_model_notrans, use_cuda, num_iter)
    print(
        f"PositionwiseFeedForward Notrans \"({batch_size}, {input_len:03})\" ",
        f"{device} Turbo NoTrans QPS, {turbo_qps_notrans}, time, {turbo_time_consume_notrans}"
    )

    self.assertTrue(
        torch.max(torch.abs(turbo_model_result - onmt_model_result)) <
        (1e-3 if use_cuda else 1e-4))
    with open(fname, "a") as fh:
        fh.write(
            f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps_trans}, {turbo_qps_notrans}\n"
        )

def check_torch_and_turbo(self, use_cuda):
    input_ids, position_ids, token_type_ids = self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"
    num_iter = 100

    torch_model = lambda: self.torch_embedding(input_ids, token_type_ids,
                                               position_ids)
    torch_result, torch_qps, torch_time = test_helper.run_model(
        torch_model, use_cuda, num_iter)
    print(f"AlbertEmbeddings \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

    turbo_model = lambda: self.turbo_embedding(input_ids, position_ids,
                                               token_type_ids)
    turbo_result, turbo_qps, turbo_time = test_helper.run_model(
        turbo_model, use_cuda, num_iter)
    print(f"AlbertEmbeddings \"({batch_size},{seq_length:03})\" ",
          f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time}")

    self.assertTrue(torch.max(torch.abs(torch_result - turbo_result)) < 1e-5)

def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda=use_cuda) device = "GPU" if use_cuda else "CPU" num_iter = 2 hidden_size = self.cfg.hidden_size input_tensor = torch.rand(size=(batch_size, 1, hidden_size), dtype=torch.float32, device=self.test_device) torch_model = lambda: self.torch_pooler(input_tensor) torch_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f"BertPooler \"({batch_size},{hidden_size:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time}") turbo_model = lambda: self.turbo_pooler( input_tensor.reshape((batch_size, hidden_size))) turbo_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"BertPooler \"({batch_size}, {hidden_size}\" ", f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}" ) torch_result = torch_result.cpu().numpy() turbo_result = turbo_result.cpu().numpy() self.assertTrue( numpy.allclose(torch_result, turbo_result, rtol=1e-4, atol=1e-3)) with open("bert_pooler_res.txt", "a") as fh: fh.write( f"\"({batch_size},{hidden_size:03})\", {torch_qps}, {torch_qps}\n" )
def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda) num_iter = 2 device = "GPU" if use_cuda else "CPU" torch_model = lambda: self.torch_bertout(self.intermediate_output, self.attention_output) torch_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f'Bert Output Plain PyTorch({device}) QPS {torch_qps}') turbo_model = lambda: self.turbo_bertout(self.intermediate_output, self.attention_output) turbo_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f'Bert Output Plain TurboTransformer({device}) QPS {turbo_qps}' ) # cuda version precision is lower due to tensor-core self.assertTrue( torch.max(torch.abs(torch_result - turbo_result)) < 1e-2 if use_cuda else 1e-4)
def check_torch_and_turbo(self, use_cuda): self.init_data(use_cuda=use_cuda) device = "GPU" if use_cuda else "CPU" num_iter = 2 turbo_model = lambda: self.turbo_layer( self.input_tensor, self.attention_mask, output_attentions=True) turbo_result, turbo_qps, turbo_time = \ test_helper.run_model(turbo_model, use_cuda, num_iter) print( f"AlbertLayer \"({batch_size},{seq_length:03})\" ", f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}" ) torch_model = lambda: self.torch_layer( self.input_tensor, self.attention_mask, output_attentions=True) torch_result, torch_qps, torch_time = \ test_helper.run_model(torch_model, use_cuda, num_iter) print(f"AlbertLayer \"({batch_size},{seq_length:03})\" ", f"{device} Torch QPS, {torch_qps}, time, {torch_time}") # print(turbo_result - torch_result[0]) # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences? cpu_tolerate_error = 1e-5 gpu_tolerate_error = 1e-3 self.assertTrue( torch.max(torch.abs(torch_result[0] - turbo_result[0])) < gpu_tolerate_error if use_cuda else cpu_tolerate_error) self.assertTrue( torch.max(torch.abs(torch_result[1] - turbo_result[1])) < gpu_tolerate_error if use_cuda else cpu_tolerate_error) with open("albert_layer_res.txt", "a") as fh: fh.write( f"\"({batch_size},{seq_length:03})\", {torch_qps}, {torch_qps}\n" )
def check_torch_and_turbo(self, use_cuda, num_iter=1): deivce_type = "GPU" if use_cuda else "CPU" info = f"\"({deivce_type}, {batch_size}, {src_length}, {T})\"" step = 2 self.init_data(use_cuda=use_cuda) self.inputs = torch.rand(batch_size, T, self.model_dim, dtype=torch.float32, device=self.test_device) self.memory_bank = torch.rand(batch_size, src_length, self.model_dim, dtype=torch.float32, device=self.test_device) self.src_pad_mask = torch.zeros(batch_size, 1, src_length, dtype=torch.float32, device=self.test_device).bool() self.tgt_pad_mask = torch.zeros(batch_size, 1, T, dtype=torch.float32, device=self.test_device).bool() onmt_model = lambda: self.onmt_decoder(self.inputs, self.memory_bank, self.src_pad_mask, self.tgt_pad_mask, layer_cache=None, step=step, future=False) onmt_result, torch_qps, torch_time_consume = \ test_helper.run_model(onmt_model, use_cuda, num_iter) onmt_mid, attns, attn_align = onmt_result print( f"ONMT Deocder {info} ", f"{deivce_type} QPS, {torch_qps}, time, {torch_time_consume}") if with_quantize_dynamic and not use_cuda: quantized_onmt_model = lambda: self.quantized_onmt_decoder( self.inputs, self.memory_bank, self.src_pad_mask, self.tgt_pad_mask, layer_cache=None, step=step, future=False) quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \ test_helper.run_model(quantized_onmt_model, use_cuda, num_iter) quantized_onmt_mid, quantized_attns, quantized_attn_align = quantized_onmt_result print( f"ONMT Quantized Deocder {info} ", f"{deivce_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}" ) # print(onmt_mid) # print(quantized_onmt_mid) # self.assertTrue( # torch.max(torch.abs(onmt_mid - # quantized_onmt_mid)) < (1e-3 if use_cuda else 1e-4)) # self.assertTrue( # torch.max(torch.abs(attns - quantized_attns)) < ( # 1e-3 if use_cuda else 1e-4)) turbo_model = lambda: self.turbo_decoder(self.inputs, self.memory_bank, self.src_pad_mask, self.tgt_pad_mask, layer_cache=None, step=step, future=False) with turbo_transformers.pref_guard(info) as perf: turbo_result, turbo_qps, turbo_time_consume = \ test_helper.run_model(turbo_model, use_cuda, num_iter) turbo_mid, turbo_attns, _ = turbo_result print( f"Turbo Deocder {info} ", f"{deivce_type} QPS, {turbo_qps}, time, {turbo_time_consume}") self.assertTrue( torch.max(torch.abs(onmt_mid - turbo_mid)) < (1e-3 if use_cuda else 1e-4)) self.assertTrue( torch.max(torch.abs(attns - turbo_attns)) < ( 1e-3 if use_cuda else 1e-4)) if with_quantize_dynamic and not use_cuda: with open(fname, "a") as fh: fh.write( f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n" ) else: with open(fname, "a") as fh: fh.write(f"{info} {torch_qps}, {turbo_qps}\n")
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    onmt_multi_headed_attention, torch_layernorm, turbo_attn_trans, turbo_attn_notrans, Q, K, V = \
        self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"
    info = f"\"({device}, {set_layer_cache}, {pre_layernorm}, {post_add_input}, {attn_type}, {batch_size}, {key_seq_len:03}, {query_seq_len:03})\""

    if attn_type == "context":
        attention_mask = torch.zeros((batch_size, 1, key_seq_len),
                                     dtype=torch.bool,
                                     device=self.test_device)
    elif attn_type == "self":
        attention_mask = None
    else:
        raise ValueError("attn_type is not supported")

    # Optionally pre-fill the layer cache with random keys and values.
    if set_layer_cache:
        memory_keys = torch.rand(size=(batch_size, self.head_count,
                                       key_seq_len, self.size_per_head),
                                 dtype=torch.float32,
                                 device=self.test_device)
        memory_values = torch.rand(size=(batch_size, self.head_count,
                                         key_seq_len, self.size_per_head),
                                   dtype=torch.float32,
                                   device=self.test_device)
        self_keys = torch.rand(size=(batch_size, self.head_count,
                                     query_seq_len, self.size_per_head),
                               dtype=torch.float32,
                               device=self.test_device)
        self_values = torch.rand(size=(batch_size, self.head_count,
                                       query_seq_len, self.size_per_head),
                                 dtype=torch.float32,
                                 device=self.test_device)
        layer_cache_torch = {
            "memory_keys": torch.clone(memory_keys),
            "memory_values": torch.clone(memory_values),
            "self_keys": torch.clone(self_keys),
            "self_values": torch.clone(self_values)
        }
    else:
        layer_cache_torch = {
            "memory_keys": None,
            "memory_values": None,
            "self_keys": None,
            "self_values": None
        }

    onmt_model = lambda: onmt_multi_headed_attention(
        K,
        V,
        torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
        mask=attention_mask,
        layer_cache=layer_cache_torch,
        attn_type=attn_type)
    onmt_multi_headed_attention_result, torch_qps, torch_time_consume = \
        test_helper.run_model(onmt_model, use_cuda, num_iter)
    # onmt_multi_headed_attention returns (output, attns).
    onmt_attns = onmt_multi_headed_attention_result[1]
    if post_add_input:
        onmt_output = onmt_multi_headed_attention_result[0] + Q
    else:
        onmt_output = onmt_multi_headed_attention_result[0]
    print(
        f"Multi Headed Attention {info} ONMT, QPS, {torch_qps}, time, {torch_time_consume}"
    )

    if with_quantize_dynamic and not use_cuda:
        q_onmt_model = lambda: self.q_onmt_multi_headed_attention(
            K,
            V,
            torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
            mask=attention_mask,
            layer_cache=layer_cache_torch,
            attn_type=attn_type)
        q_onmt_multi_headed_attention_result, q_torch_qps, q_torch_time_consume = \
            test_helper.run_model(q_onmt_model, use_cuda, num_iter)
        # When dynamic quantization is enabled, the quantized outputs replace
        # the float ONMT outputs as the reference for the comparisons below.
        onmt_attns = q_onmt_multi_headed_attention_result[1]
        if post_add_input:
            onmt_output = q_onmt_multi_headed_attention_result[0] + Q
        else:
            onmt_output = q_onmt_multi_headed_attention_result[0]
        print(
            f"Multi Headed Attention {info} Q-ONMT, QPS, {q_torch_qps}, time, {q_torch_time_consume}"
        )

    # Benchmark turbo with transposed weights. Turbo expects an additive
    # float mask: -1e18 at padded positions.
    turbo_attention_mask = attention_mask.float(
    ) * -1e18 if attention_mask is not None else None

    if set_layer_cache:
        layer_cache_turbo = {
            "memory_keys": torch.clone(memory_keys),
            "memory_values": torch.clone(memory_values),
            "self_keys": torch.clone(self_keys),
            "self_values": torch.clone(self_values)
        }
    else:
        layer_cache_turbo = {
            "memory_keys": None,
            "memory_values": None,
            "self_keys": None,
            "self_values": None
        }

    turbo_model_trans = lambda: turbo_attn_trans(
        K,
        V,
        Q,
        turbo_attention_mask,
        layer_cache=layer_cache_turbo,
        attn_type=attn_type,
        pre_layernorm=pre_layernorm,
        post_add_input=post_add_input,
        is_trans_weight=True)
    turbo_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model_trans, use_cuda, num_iter)
    turbo_output_trans, turbo_attns_trans = turbo_result
    print(
        f"Multi Headed Attention {info} Turbo Trans, QPS, {turbo_qps}, time, {turbo_time_consume}"
    )

    self.assertTrue(
        torch.max(torch.abs(onmt_output - turbo_output_trans)) <
        (1e-3 if use_cuda else 1e-4))
    self.assertTrue(
        torch.max(torch.abs(onmt_attns - turbo_attns_trans)) <
        (1e-3 if use_cuda else 1e-4))

    if layer_cache_torch is not None:
        for k, v in layer_cache_torch.items():
            if v is not None:
                self.assertTrue(
                    torch.max(torch.abs(layer_cache_turbo[k] - v)) < 1e-3)

    # Benchmark turbo with non-transposed weights, resetting the cache first.
    if set_layer_cache:
        layer_cache_turbo = {
            "memory_keys": torch.clone(memory_keys),
            "memory_values": torch.clone(memory_values),
            "self_keys": torch.clone(self_keys),
            "self_values": torch.clone(self_values)
        }
    else:
        layer_cache_turbo = {
            "memory_keys": None,
            "memory_values": None,
            "self_keys": None,
            "self_values": None
        }

    turbo_model_notrans = lambda: turbo_attn_notrans(
        K,
        V,
        Q,
        turbo_attention_mask,
        layer_cache=layer_cache_turbo,
        attn_type=attn_type,
        pre_layernorm=pre_layernorm,
        post_add_input=post_add_input,
        is_trans_weight=False)
    with turbo_transformers.pref_guard("pref_test") as perf:
        turbo_result, turbo_qps, turbo_time_consume_notrans = \
            test_helper.run_model(turbo_model_notrans, use_cuda, num_iter)
    turbo_output_notrans, turbo_attns_notrans = turbo_result
    print(
        f"Multi Headed Attention {info} Turbo NoTrans, QPS, {turbo_qps}, time, {turbo_time_consume_notrans}"
    )

    self.assertTrue(
        torch.max(torch.abs(onmt_output - turbo_output_notrans)) <
        (1e-3 if use_cuda else 1e-4))
    self.assertTrue(
        torch.max(torch.abs(onmt_attns - turbo_attns_notrans)) <
        (1e-3 if use_cuda else 1e-4))

    if with_quantize_dynamic and not use_cuda:
        with open(fname, "a") as fh:
            fh.write(f"{info} {torch_qps}, {q_torch_qps}, {turbo_qps}\n")
    else:
        with open(fname, "a") as fh:
            fh.write(f"{info} {torch_qps}, {turbo_qps}\n")

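# ONMT passes boolean padding masks, while the Turbo attention above consumes
# an additive float mask (-1e18 at padded positions, as built inline above).
# A tiny converter capturing that convention:
def to_additive_mask(bool_mask):
    # True (padded) positions get -1e18, which softmax maps to ~0 weight.
    return bool_mask.float() * -1e18 if bool_mask is not None else None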