def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Run the torch FFN and the turbo FFN on the same input and compare them.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also selects the "GPU"/"CPU" label used in the printed report.
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    The test passes when the largest absolute element-wise difference
    between both outputs is below 1e-3.  One line holding both QPS figures
    is appended to the file named by the enclosing-scope ``fname``.
    """
    self.init_data(use_cuda)
    device_label = "GPU" if use_cuda else "CPU"

    def run_torch():
        # Reference path: FFN output plus the residual input, then layer norm.
        return self.output_layer_norm(
            self.torch_ffn(self.inputs) + self.inputs)

    def run_turbo():
        return self.turbo_ffn(self.inputs, is_trans_weight=True)

    # run_model's return value is unpacked as (result, qps, elapsed time).
    torch_out, torch_qps, torch_time_consume = test_helper.run_model(
        run_torch, use_cuda, num_iter)
    print(
        f"DistrillFFN \"({batch_size}, {input_len:03})\" ",
        f"{device_label} Torch QPS, {torch_qps}, time, {torch_time_consume}")

    with turbo_transformers.pref_guard("gpref_test"):
        turbo_out, turbo_qps, turbo_time_consume = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
    print(
        f"DistrillFFN \"({batch_size}, {input_len:03})\" ",
        f"{device_label} Turbo Trans QPS, {turbo_qps}, time, {turbo_time_consume}"
    )

    max_abs_diff = torch.max(torch.abs(torch_out - turbo_out))
    print(max_abs_diff)
    self.assertTrue(max_abs_diff < 1e-3)

    report_line = (
        f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n")
    with open(fname, "a") as report:
        report.write(report_line)
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Benchmark the ONMT encoder against the turbo encoder and compare outputs.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also picks the "GPU"/"CPU" label and the comparison tolerance
            (1e-3 on GPU, 1e-4 on CPU).
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    When the enclosing-scope flag ``with_quantize_dynamic`` is set and the
    run is on CPU, the quantized ONMT encoder is timed as well and its QPS
    is added to the line appended to ``fname``.
    """
    device_label = "GPU" if use_cuda else "CPU"
    info = f"\"({device_label}, {batch_size}, {src_length}, {T})\""
    benchmark_quantized = with_quantize_dynamic and not use_cuda

    self.init_data(use_cuda=use_cuda)
    self.inputs = torch.rand(batch_size,
                             src_length,
                             self.model_dim,
                             dtype=torch.float32,
                             device=self.test_device)
    # randint's upper bound is exclusive, so every entry is negative and
    # the derived boolean mask below is False everywhere.
    self.mask = torch.randint(-100,
                              0, (batch_size, 1, src_length),
                              dtype=torch.int64,
                              device=self.test_device)
    onmt_mask = self.mask > 0

    def run_onmt():
        return self.onmt_encoder(self.inputs, onmt_mask)

    def run_turbo():
        return self.turbo_encoder(self.inputs, onmt_mask)

    onmt_result, torch_qps, torch_time_consume = test_helper.run_model(
        run_onmt, use_cuda, num_iter)
    print(
        f"ONMT Encoder {info} ",
        f"{device_label} QPS, {torch_qps}, time, {torch_time_consume}")

    if benchmark_quantized:

        def run_quantized():
            return self.quantized_onmt_encoder(self.inputs, onmt_mask)

        # The quantized output is only timed, never compared.
        _, quantized_torch_qps, quantized_torch_time_consume = \
            test_helper.run_model(run_quantized, use_cuda, num_iter)
        print(
            f"ONMT Quantized Encoder {info} ",
            f"{device_label} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}"
        )

    with turbo_transformers.pref_guard(info):
        turbo_result, turbo_qps, turbo_time_consume = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
    print(
        f"Turbo Encoder {info} ",
        f"{device_label} QPS, {turbo_qps}, time, {turbo_time_consume}")

    max_abs_diff = torch.max(torch.abs(onmt_result - turbo_result))
    print(f"diff max {max_abs_diff}")
    tolerance = 1e-3 if use_cuda else 1e-4
    self.assertTrue(max_abs_diff < tolerance)

    if benchmark_quantized:
        report_line = f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n"
    else:
        report_line = f"{info} {torch_qps}, {turbo_qps}\n"
    with open(fname, "a") as report:
        report.write(report_line)
def check_torch_and_turbo(self, use_cuda):
    """Compare the PyTorch BertModel with the turbo model on random token ids.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also selects the "GPU"/"CPU" label in the printed QPS report.

    A (1, 10) batch of random ids is fed to both models for one iteration.
    The first sequence position of the torch model's first output is
    compared with the turbo model's first output via ``numpy.allclose``
    (atol=1e-3, rtol=1e-3).
    """
    self.init_data(use_cuda)
    num_iter = 1
    device_name = "GPU" if use_cuda else "CPU"

    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=self.test_device)

    def run_torch():
        return self.torch_model(input_ids)

    def run_turbo():
        return self.turbo_model(input_ids)

    torch_result, torch_qps, _ = test_helper.run_model(
        run_torch, use_cuda, num_iter)
    print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

    with turbo_transformers.pref_guard("bert_perf"):
        turbo_result, turbo_qps, _ = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
    print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

    # Slice [:, 0] keeps only the first sequence position of the torch output.
    expected = torch_result[0][:, 0].cpu()
    actual = turbo_result[0].cpu()
    self.assertTrue(numpy.allclose(expected, actual, atol=1e-3, rtol=1e-3))
def check_torch_and_turbo(self, use_cuda):
    """Benchmark the turbo Albert model against the torch one and compare outputs.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also selects the "GPU"/"CPU" label in the printed report.

    Both models are run once on ``self.input_tensor`` with no attention or
    head mask.  The test passes when the largest absolute difference between
    the first outputs is below 1e-2.  A line with the torch and turbo QPS is
    appended to ``albert_model_res.txt``.
    """
    self.init_data(use_cuda=use_cuda)
    device = "GPU" if use_cuda else "CPU"
    num_iter = 1

    turbo_model = lambda: self.turbo_model(
        self.input_tensor, attention_mask=None, head_mask=None)
    # run_model's return value is unpacked as (result, qps, elapsed time).
    turbo_result, turbo_qps, turbo_time = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(
        f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
        f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}"
    )

    torch_model = lambda: self.torch_model(input_ids=self.input_tensor,
                                           attention_mask=None,
                                           head_mask=None)
    # NOTE(review): the profiling guard wraps the torch run here, whereas
    # sibling tests wrap the turbo run - confirm which one is intended.
    with turbo_transformers.pref_guard("albert_perf") as perf:
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

    # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
    tolerate_error = 1e-2
    self.assertTrue(
        torch.max(torch.abs(torch_result[0] - turbo_result[0])) <
        tolerate_error)

    # Record torch QPS followed by turbo QPS; the previous code wrote the
    # torch figure twice, so the turbo throughput never reached the file.
    with open("albert_model_res.txt", "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
        )
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Benchmark the torch DistilBERT model against the turbo one and compare.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also picks the "GPU"/"CPU" label and the comparison tolerance
            (1e-2 on GPU, 1e-3 on CPU).
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    The test passes when the largest absolute difference between the first
    outputs of both models is below the tolerance.  A line with both QPS
    figures is appended to the file named by the enclosing-scope ``fname``.
    """
    self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"

    torch_model = lambda: self.torch_model(self.inputs,
                                           self.attention_mask)
    # run_model's return value is unpacked as (result, qps, elapsed time).
    torch_res, torch_qps, torch_time_consume = \
        test_helper.run_model(torch_model, use_cuda, num_iter)
    print(
        f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
        f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

    turbo_model = lambda: self.turbo_transformer(
        self.inputs, self.attention_mask, head_mask=self.head_mask)
    with turbo_transformers.pref_guard("gpref_test") as perf:
        turbo_res, turbo_qps, turbo_time_consume = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(
        f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
        f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

    # The tolerance must be parenthesised: a conditional expression binds
    # looser than `<`, so the unparenthesised form evaluated to the constant
    # 1e-3 on CPU and assertTrue always passed there.
    tolerance = 1e-2 if use_cuda else 1e-3
    self.assertTrue(
        torch.max(torch.abs(torch_res[0] - turbo_res[0])) < tolerance)

    with open(fname, "a") as fh:
        fh.write(
            f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n"
        )
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Benchmark the ONMT feed-forward layer against both turbo variants.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also picks the "GPU"/"CPU" label and the comparison tolerance
            (1e-3 on GPU, 1e-4 on CPU).
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    The turbo layer is run twice, with ``is_trans_weight=True`` and with
    ``is_trans_weight=False``.  Each result must match the ONMT output
    within the tolerance.  A line with the three QPS figures is appended to
    the file named by the enclosing-scope ``fname``.
    """
    onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs = self.init_data(
        use_cuda)
    device = "GPU" if use_cuda else "CPU"
    tolerance = 1e-3 if use_cuda else 1e-4

    onmt_model = lambda: onmt_ffn(inputs)
    # run_model's return value is unpacked as (result, qps, elapsed time).
    onmt_model_result, torch_qps, torch_time_consume = \
        test_helper.run_model(onmt_model, use_cuda, num_iter)
    print(
        f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
        f"{device} ONMT QPS, {torch_qps}, time, {torch_time_consume}")

    turbo_model_trans = lambda: turbo_ffn_trans(inputs,
                                                is_trans_weight=True)
    with turbo_transformers.pref_guard("gpref_test") as perf:
        turbo_trans_result, turbo_qps_trans, turbo_time_consume_trans = \
            test_helper.run_model(turbo_model_trans, use_cuda, num_iter)
    print(
        f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
        f"{device} Turbo Trans QPS, {turbo_qps_trans}, time, {turbo_time_consume_trans}"
    )

    turbo_model_notrans = lambda: turbo_ffn_notrans(
        inputs, is_trans_weight=False)
    with turbo_transformers.pref_guard("gpref_test") as perf:
        turbo_notrans_result, turbo_qps_notrans, turbo_time_consume_notrans = \
            test_helper.run_model(turbo_model_notrans, use_cuda, num_iter)
    print(
        f"PositionwiseFeedForward Notrans \"({batch_size}, {input_len:03})\" ",
        f"{device} Turbo NoTrans QPS, {turbo_qps_notrans}, time, {turbo_time_consume_notrans}"
    )

    # Check both variants.  Previously the transposed-weight result was
    # overwritten by the non-transposed run before being compared, so only
    # the latter was ever verified.
    self.assertTrue(
        torch.max(torch.abs(turbo_trans_result - onmt_model_result)) <
        tolerance)
    self.assertTrue(
        torch.max(torch.abs(turbo_notrans_result - onmt_model_result)) <
        tolerance)

    with open(fname, "a") as fh:
        fh.write(
            f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps_trans}, {turbo_qps_notrans}\n"
        )
def check_torch_and_turbo(self, batch_size, seq_length, use_cuda,
                          use_memory_opt):
    """Benchmark the turbo Albert model against the torch one and compare outputs.

    Args:
        batch_size: first dimension of the random id tensor fed to both models.
        seq_length: second dimension of the random id tensor.
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also selects the "GPU"/"CPU" label in the printed report.
        use_memory_opt: when true, ``bert_opt_mem_allocate_api`` is called
            with the input and model dimensions before the turbo run.

    The test passes when the largest absolute difference between the first
    outputs is below 1e-2.  A line with the torch and turbo QPS is appended
    to ``albert_model_res.txt``.
    """
    self.init_data(use_cuda=use_cuda)
    self.input_tensor = torch.randint(low=0,
                                      high=self.cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      device=self.test_device)

    device = "GPU" if use_cuda else "CPU"
    num_iter = 1

    if use_memory_opt:
        turbo_transformers.bert_opt_mem_allocate_api(
            self.input_tensor.size()[0],  # batch
            self.input_tensor.size()[1],  # seq_len
            self.cfg.num_attention_heads,
            self.cfg.hidden_size,
            self.cfg.num_hidden_layers,
            "GPU" if 'cuda' in self.input_tensor.device.type else "CPU")

    turbo_model = lambda: self.turbo_model(
        self.input_tensor, attention_mask=None, head_mask=None)
    # run_model's return value is unpacked as (result, qps, elapsed time).
    turbo_result, turbo_qps, turbo_time = \
        test_helper.run_model(turbo_model, use_cuda, num_iter)
    print(
        f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
        f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}")

    torch_model = lambda: self.torch_model(
        input_ids=self.input_tensor, attention_mask=None, head_mask=None)
    # NOTE(review): the profiling guard wraps the torch run here, whereas
    # sibling tests wrap the turbo run - confirm which one is intended.
    with turbo_transformers.pref_guard("albert_perf") as perf:
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
    print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
          f"{device} Torch QPS, {torch_qps}, time, {torch_time}")

    # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
    tolerate_error = 1e-2
    self.assertTrue(
        torch.max(torch.abs(torch_result[0] - turbo_result[0])) <
        tolerate_error)

    # Record torch QPS followed by turbo QPS; the previous code wrote the
    # torch figure twice, so the turbo throughput never reached the file.
    with open("albert_model_res.txt", "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
        )
def check_torch_and_turbo(self,
                          use_cuda,
                          batch_size,
                          seq_len,
                          use_memory_opt=True):
    """Compare the PyTorch BertModel with the turbo model on random token ids.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also selects the "GPU"/"CPU" label in the printed QPS report.
        batch_size: first dimension of the random id tensor.
        seq_len: second dimension of the random id tensor.
        use_memory_opt: when true, ``bert_opt_mem_allocate_api`` is called
            with the input and model dimensions before the turbo run.

    The first outputs of both models must agree under ``numpy.allclose``
    with atol=1e-2 and rtol=1e-3.
    """
    self.init_data(use_cuda)
    num_iter = 1
    device_name = "GPU" if use_cuda else "CPU"

    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=self.test_device)

    def run_torch():
        return self.torch_model(input_ids)

    def run_turbo():
        return self.turbo_model(input_ids)

    torch_result, torch_qps, _ = test_helper.run_model(
        run_torch, use_cuda, num_iter)
    print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

    if use_memory_opt:
        ids_shape = input_ids.size()
        allocator_device = "GPU" if 'cuda' in input_ids.device.type else "CPU"
        turbo_transformers.bert_opt_mem_allocate_api(
            ids_shape[0],  # batch
            ids_shape[1],  # seq_len
            self.cfg.num_attention_heads,
            self.cfg.hidden_size,
            self.cfg.num_hidden_layers,
            allocator_device)

    with turbo_transformers.pref_guard("bert_perf"):
        turbo_result, turbo_qps, _ = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
    print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

    print(f"batch {batch_size} seq_len {seq_len}")
    torch_first = torch_result[0].cpu()
    turbo_first = turbo_result[0].cpu()
    # Printed value is the largest signed (not absolute) difference.
    print(torch.max(torch_first - turbo_first))
    self.assertTrue(
        numpy.allclose(torch_first, turbo_first, atol=1e-2, rtol=1e-3))
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Benchmark ONMT multi-headed attention against both turbo variants.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also picks the "GPU"/"CPU" label and the comparison tolerance
            (1e-3 on GPU, 1e-4 on CPU).
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    Raises:
        ValueError: if the enclosing-scope ``attn_type`` is neither
            "context" nor "self".

    The configuration (``attn_type``, ``set_layer_cache``,
    ``pre_layernorm``, ``post_add_input``, sequence lengths, ``fname`` ...)
    is read from the enclosing scope.  The turbo attention is run with
    ``is_trans_weight=True`` and ``is_trans_weight=False``; outputs and
    attention weights of each run must match the reference within the
    tolerance.  A line with the QPS figures is appended to ``fname``.
    """
    onmt_multi_headed_attention, torch_layernorm, turbo_attn_trans, turbo_attn_notrans, Q, K, V = \
        self.init_data(use_cuda)
    device = "GPU" if use_cuda else "CPU"
    tolerance = 1e-3 if use_cuda else 1e-4
    benchmark_quantized = with_quantize_dynamic and not use_cuda
    info = f"\"({device}, {set_layer_cache}, {pre_layernorm}, {post_add_input}, {attn_type}, {batch_size}, {key_seq_len:03}, {query_seq_len:03})\""

    if attn_type == "context":
        attention_mask = torch.zeros((batch_size, 1, key_seq_len),
                                     dtype=torch.bool,
                                     device=self.test_device)
    elif attn_type == "self":
        attention_mask = None
    else:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so the message reaches the caller.
        raise ValueError("attn type is not supported")

    def _rand_cache(seq_len):
        # One random cache tensor of shape (batch, heads, seq_len, head size).
        return torch.rand(size=(batch_size, self.head_count, seq_len,
                                self.size_per_head),
                          dtype=torch.float32,
                          device=self.test_device)

    # Seed tensors shared by every run; each run gets its own clones so that
    # in-place cache updates by one implementation cannot leak into another.
    if set_layer_cache:
        cache_seed = {
            "memory_keys": _rand_cache(key_seq_len),
            "memory_values": _rand_cache(key_seq_len),
            "self_keys": _rand_cache(query_seq_len),
            "self_values": _rand_cache(query_seq_len),
        }
        print("self_keys size: ", cache_seed["self_keys"].size())
    else:
        cache_seed = {
            "memory_keys": None,
            "memory_values": None,
            "self_keys": None,
            "self_values": None,
        }

    def _fresh_layer_cache():
        # Build a new layer cache dict holding clones of the seed tensors.
        return {
            name: None if seed is None else torch.clone(seed)
            for name, seed in cache_seed.items()
        }

    def _reference_outputs(result):
        # Split an ONMT (output, attns) result, adding Q when post_add_input.
        output = result[0] + Q if post_add_input else result[0]
        return output, result[1]

    layer_cache_torch = _fresh_layer_cache()

    onmt_model = lambda: onmt_multi_headed_attention(
        K,
        V,
        torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
        mask=attention_mask,
        layer_cache=layer_cache_torch,
        attn_type=attn_type)
    # run_model's return value is unpacked as (result, qps, elapsed time).
    onmt_multi_headed_attention_result, torch_qps, torch_time_consume = \
        test_helper.run_model(onmt_model, use_cuda, num_iter)
    onmt_output, onmt_attns = _reference_outputs(
        onmt_multi_headed_attention_result)
    print(
        f"Multi Headed Attention {info} ONMT, QPS,{torch_qps}, time, {torch_time_consume}"
    )

    if benchmark_quantized:
        # NOTE(review): the quantized run reuses layer_cache_torch after the
        # float run above, and its outputs replace the reference used for
        # the turbo comparisons below - confirm this is intended.
        q_onmt_model = lambda: self.q_onmt_multi_headed_attention(
            K,
            V,
            torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
            mask=attention_mask,
            layer_cache=layer_cache_torch,
            attn_type=attn_type)
        q_onmt_multi_headed_attention_result, q_torch_qps, q_torch_time_consume = \
            test_helper.run_model(q_onmt_model, use_cuda, num_iter)
        onmt_output, onmt_attns = _reference_outputs(
            q_onmt_multi_headed_attention_result)
        print(
            f"Multi Headed Attention {info} Q-ONMT, QPS, {q_torch_qps}, time, {q_torch_time_consume}"
        )

    # The turbo layers receive the boolean mask as floats scaled by -1e18.
    turbo_attention_mask = attention_mask.float(
    ) * -1e18 if attention_mask is not None else None

    # benchmarking turbo with weight transposed
    layer_cache_turbo = _fresh_layer_cache()
    turbo_model_trans = lambda: turbo_attn_trans(
        K,
        V,
        Q,
        turbo_attention_mask,
        layer_cache=layer_cache_turbo,
        attn_type=attn_type,
        pre_layernorm=pre_layernorm,
        post_add_input=post_add_input,
        is_trans_weight=True)
    turbo_result, turbo_qps, turbo_time_consume = \
        test_helper.run_model(turbo_model_trans, use_cuda, num_iter)
    turbo_output_trans, turbo_attns_trans = turbo_result
    print(
        f"Multi Headed Attention {info} Turbo Trans, QPS, {turbo_qps}, time, {turbo_time_consume}"
    )
    self.assertTrue(
        torch.max(torch.abs(onmt_output - turbo_output_trans)) < tolerance)
    self.assertTrue(
        torch.max(torch.abs(onmt_attns - turbo_attns_trans)) < tolerance)

    # Entries populated in the reference cache must match the turbo cache.
    for k, v in layer_cache_torch.items():
        if v is not None:
            self.assertTrue(
                torch.max(torch.abs(layer_cache_turbo[k] - v)) < 1e-3)

    # benchmarking turbo with weight not transposed
    layer_cache_turbo = _fresh_layer_cache()
    turbo_model_notrans = lambda: turbo_attn_notrans(
        K,
        V,
        Q,
        turbo_attention_mask,
        layer_cache=layer_cache_turbo,
        attn_type=attn_type,
        pre_layernorm=pre_layernorm,
        post_add_input=post_add_input,
        is_trans_weight=False)
    with turbo_transformers.pref_guard("pref_test") as perf:
        # turbo_qps is rebound here, so the report line below carries the
        # non-transposed QPS.
        turbo_result, turbo_qps, turbo_time_consume_notrans = \
            test_helper.run_model(turbo_model_notrans, use_cuda, num_iter)
    turbo_output_notrans, turbo_attns_notrans = turbo_result
    print(
        f"Multi Headed Attention {info} Turbo NoTrans, QPS,{turbo_qps}, time, {turbo_time_consume_notrans}"
    )
    self.assertTrue(
        torch.max(torch.abs(onmt_output - turbo_output_notrans)) < tolerance)
    self.assertTrue(
        torch.max(torch.abs(onmt_attns - turbo_attns_notrans)) < tolerance)

    if benchmark_quantized:
        with open(fname, "a") as fh:
            fh.write(
                f"{info} {torch_qps}, {q_torch_qps}, {turbo_qps}\n")
    else:
        with open(fname, "a") as fh:
            fh.write(f"{info} {torch_qps}, {turbo_qps}\n")
def check_torch_and_turbo(self, use_cuda, num_iter=1):
    """Benchmark the ONMT decoder against the turbo decoder and compare outputs.

    Args:
        use_cuda: forwarded to ``init_data`` and ``test_helper.run_model``;
            also picks the "GPU"/"CPU" label and the comparison tolerance
            (1e-3 on GPU, 1e-4 on CPU).
        num_iter: iteration count forwarded to ``test_helper.run_model``.

    Random inputs and a random memory bank are generated, with all-False
    padding masks.  The first two elements of the decoder results must
    match within the tolerance.  When the enclosing-scope flag
    ``with_quantize_dynamic`` is set and the run is on CPU, the quantized
    ONMT decoder is timed as well.  A line with the QPS figures is
    appended to ``fname``.
    """
    device_label = "GPU" if use_cuda else "CPU"
    info = f"\"({device_label}, {batch_size}, {src_length}, {T})\""
    step = 2
    benchmark_quantized = with_quantize_dynamic and not use_cuda
    tolerance = 1e-3 if use_cuda else 1e-4

    self.init_data(use_cuda=use_cuda)

    self.inputs = torch.rand(batch_size,
                             T,
                             self.model_dim,
                             dtype=torch.float32,
                             device=self.test_device)
    self.memory_bank = torch.rand(batch_size,
                                  src_length,
                                  self.model_dim,
                                  dtype=torch.float32,
                                  device=self.test_device)
    # Zero tensors cast to bool: both padding masks are False everywhere.
    self.src_pad_mask = torch.zeros(batch_size,
                                    1,
                                    src_length,
                                    dtype=torch.float32,
                                    device=self.test_device).bool()
    self.tgt_pad_mask = torch.zeros(batch_size,
                                    1,
                                    T,
                                    dtype=torch.float32,
                                    device=self.test_device).bool()

    def decode_with(decoder):
        # All decoders are called with exactly the same arguments.
        return decoder(self.inputs,
                       self.memory_bank,
                       self.src_pad_mask,
                       self.tgt_pad_mask,
                       layer_cache=None,
                       step=step,
                       future=False)

    def run_onmt():
        return decode_with(self.onmt_decoder)

    def run_turbo():
        return decode_with(self.turbo_decoder)

    onmt_result, torch_qps, torch_time_consume = test_helper.run_model(
        run_onmt, use_cuda, num_iter)
    onmt_mid, attns, _attn_align = onmt_result
    print(
        f"ONMT Deocder {info} ",
        f"{device_label} QPS, {torch_qps}, time, {torch_time_consume}")

    if benchmark_quantized:

        def run_quantized():
            return decode_with(self.quantized_onmt_decoder)

        quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \
            test_helper.run_model(run_quantized, use_cuda, num_iter)
        # The quantized outputs are unpacked but only benchmarked; they are
        # not compared against the float decoder.
        _q_mid, _q_attns, _q_attn_align = quantized_onmt_result
        print(
            f"ONMT Quantized Deocder {info} ",
            f"{device_label} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}"
        )

    with turbo_transformers.pref_guard(info):
        turbo_result, turbo_qps, turbo_time_consume = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
    turbo_mid, turbo_attns, _ = turbo_result
    print(
        f"Turbo Deocder {info} ",
        f"{device_label} QPS, {turbo_qps}, time, {turbo_time_consume}")

    self.assertTrue(torch.max(torch.abs(onmt_mid - turbo_mid)) < tolerance)
    self.assertTrue(torch.max(torch.abs(attns - turbo_attns)) < tolerance)

    if benchmark_quantized:
        report_line = f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n"
    else:
        report_line = f"{info} {torch_qps}, {turbo_qps}\n"
    with open(fname, "a") as report:
        report.write(report_line)