def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None):
    # Move the PyTorch weights onto the target device before they are
    # shared with the TurboTransformers layers.
    if device is not None and 'cuda' in device.type and \
            torch.cuda.is_available():
        model.to(device)

    embeddings = BertEmbeddings.from_torch(model.embeddings)
    encoder = BertEncoder.from_torch(model.encoder)
    return BertModelNoPooler(embeddings, encoder)
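For orientation, a minimal usage sketch follows. It assumes the converter above is exposed as turbo_transformers.BertModelNoPooler.from_torch; the config and device choices are illustrative.

import torch
import turbo_transformers
from transformers import BertModel as TorchBertModel, BertConfig

torch_model = TorchBertModel(BertConfig())  # randomly initialized HF model, illustrative
torch_model.eval()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Assumed binding of the converter defined above:
turbo_model = turbo_transformers.BertModelNoPooler.from_torch(torch_model, device)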
def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None):
    if device is not None and 'cuda' in device.type and \
            torch.cuda.is_available():
        model.to(device)

    # The following code can be copied verbatim for other task heads.
    bertmodel = turbo_transformers.BertModel.from_torch(model.bert)
    # Note: `classifier` is a class member of BertForSequenceClassification.
    # If your model defines other class members, adapt this line accordingly.
    return BertForSequenceClassification(bertmodel, model.classifier)
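A hedged usage sketch for the classification path: the checkpoint name is illustrative, BertForSequenceClassification is the wrapper class this converter returns (assumed defined alongside it), and the wrapper is assumed to be callable with input ids, mirroring how the test below invokes a converted model.

import torch
from transformers import BertForSequenceClassification as TorchBertForSequenceClassification

torch_model = TorchBertForSequenceClassification.from_pretrained('bert-base-uncased')  # illustrative checkpoint
torch_model.eval()
turbo_model = from_torch(torch_model)  # the converter defined above
input_ids = torch.randint(low=0, high=torch_model.config.vocab_size - 1,
                          size=(1, 16), dtype=torch.long)
logits = turbo_model(input_ids)  # assumed callable with input ids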
import unittest

import numpy
import torch
import turbo_transformers
from transformers import BertModel, BertConfig

import test_helper  # local benchmarking helper from the TurboTransformers test suite


class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = lambda: self.turbo_model(input_ids)
        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch size
                input_ids.size()[1],  # sequence length
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformers({device_name}) QPS {turbo_qps}')
        print(f"batch {batch_size} seq_len {seq_len}")
        # Report the largest element-wise difference before asserting closeness.
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # The memory-optimized variant can be enabled by passing True:
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
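Running the file as a script is assumed to go through the standard unittest entry point:

if __name__ == '__main__':
    unittest.main()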
def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None,
               backend: Optional[str] = None,
               use_memory_opt=False):
    """
    Args:
        model : a PyTorch BertModel
        device : the CPU or GPU device to run on
        backend : a string indicating the kernel provider; four options:
            [onnxrt-cpu, onnxrt-gpu, turbo-cpu, turbo-gpu]
        use_memory_opt [bool] : whether to use memory optimization for
            variable-length inputs.
    """
    use_gpu = False
    if device is None:
        device = model.device
    # The model may need to be moved to the GPU explicitly.
    if 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
        if backend is None:
            backend = "turbo"  # on GPU, turbo is faster
        use_gpu = True
    else:
        if backend is None:
            backend = "onnxrt"  # on CPU, onnxrt is faster

    if backend == "turbo":
        embeddings = BertEmbeddings.from_torch(model.embeddings)
        encoder = BertEncoder.from_torch(model.encoder)
        bertmodel_nopooler = BertModelNoPooler(embeddings, encoder)
        pooler = BertPooler.from_torch(model.pooler)
        return BertModel(bertmodel_nopooler, pooler, "turbo", model.config)
    elif backend == "onnxrt":
        import onnx
        import onnxruntime
        import onnxruntime.backend
        inputs = {
            # a batch of numerical ids for the tokenized text
            'input_ids':
            torch.randint(32, [2, 32], dtype=torch.long).to(device),
            # dummy attention mask of ones
            'attention_mask':
            torch.ones([2, 32], dtype=torch.long).to(device),
            # dummy token type ids of ones
            'token_type_ids':
            torch.ones([2, 32], dtype=torch.long).to(device),
        }
        onnx_model_path = "/tmp/temp_turbo_onnx.model"
        with open(onnx_model_path, 'wb') as outf:
            torch.onnx.export(
                model=model,
                # model input (or a tuple for multiple inputs)
                args=(inputs['input_ids'], inputs['attention_mask'],
                      inputs['token_type_ids']),
                f=outf,
                input_names=['input_ids', 'attention_mask', 'token_type_ids'],
                # the ONNX opset version to export the model to
                opset_version=11,
                # whether to execute constant folding for optimization
                do_constant_folding=True,
                output_names=['output'],
                dynamic_axes={
                    'input_ids': [0, 1],
                    'attention_mask': [0, 1],
                    'token_type_ids': [0, 1]
                })
        # num_threads = "8"
        # os.environ['OMP_NUM_THREADS'] = str(num_threads)
        # os.environ['MKL_NUM_THREADS'] = str(num_threads)
        onnx_model = onnx.load_model(f=onnx_model_path)
        onnx_model = onnxruntime.backend.prepare(
            model=onnx_model,
            device='GPU' if use_gpu else "CPU",
            graph_optimization_level=onnxruntime.GraphOptimizationLevel.
            ORT_ENABLE_ALL)
        return BertModel(onnx_model, None, "onnxrt")
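To close, a short sketch of exercising the backend switch. It assumes this converter is bound as turbo_transformers.BertModel.from_torch, which is how the test above invokes it; the config and device are illustrative.

import torch
import turbo_transformers
from transformers import BertModel as TorchBertModel, BertConfig

torch_model = TorchBertModel(BertConfig())
torch_model.eval()
# Without an explicit backend, CPU defaults to "onnxrt" and GPU to "turbo";
# forcing "turbo" keeps CPU inference inside TurboTransformers kernels.
turbo_model = turbo_transformers.BertModel.from_torch(
    torch_model, torch.device('cpu'), backend="turbo")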