def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None):
    """Build a ``BertModelNoPooler`` from a PyTorch BERT model.

    Args:
        model: Source PyTorch model; its ``embeddings`` and ``encoder``
            attributes are each passed to the matching ``from_torch``
            converter.
        device: Optional target device. The model is moved to it only when
            it is a CUDA device and CUDA is available.

    Returns:
        A ``BertModelNoPooler`` wrapping the converted embeddings and
        encoder.
    """
    wants_cuda = device is not None and 'cuda' in device.type
    if wants_cuda and torch.cuda.is_available():
        model.to(device)

    return BertModelNoPooler(
        BertEmbeddings.from_torch(model.embeddings),
        BertEncoder.from_torch(model.encoder),
    )
def from_torch(
        model: TorchBertModel,
        device: Optional[torch.device] = None):
    """Implementation of ``from_torch`` for ``BertForSequenceClassification``.

    Args:
        model: Source PyTorch model; its ``bert`` and ``classifier``
            attributes are read.
        device: Optional target device. The model is moved to it only when
            it is a CUDA device and CUDA is available.

    Returns:
        A ``BertForSequenceClassification`` built from the converted BERT
        model and the original ``classifier`` member.
    """
    wants_cuda = device is not None and 'cuda' in device.type
    if wants_cuda and torch.cuda.is_available():
        model.to(device)

    # This conversion can be copied as-is into user-defined wrappers.
    turbo_bert = turbo_transformers.BertModel.from_torch(model.bert)

    # Notice: `classifier` is the class member of
    # BertForSequenceClassification. If a user defines other class members,
    # they need to be forwarded here as well.
    return BertForSequenceClassification(turbo_bert, model.classifier)
class BertMultiTask:
    """Bundle a BERT encoder with an ``MTLRouting`` network for pre-training.

    The encoder is either built from the job's model config or loaded from
    pretrained weights, then registered with the routing network together
    with a ``BertPretrainingLoss`` under ``BatchType.PRETRAIN_BATCH``.
    """

    def __init__(self, job_config, use_pretrain, tokenizer, cache_dir, device,
                 write_log, summary_writer):
        """Create the encoder, the routing network and the pre-training loss.

        Args:
            job_config: Object providing ``get_model_config()`` and
                ``get_model_file_type()``.
            use_pretrain: When truthy, load the encoder with
                ``BertModel.from_pretrained``; otherwise build it from the
                job's model config.
            tokenizer: Only ``tokenizer.vocab`` is read, to size the
                vocabulary when the encoder is built from config.
            cache_dir: Not read by this constructor.
            device: Device the encoder and the loss module are moved to.
            write_log: Forwarded to ``MTLRouting``.
            summary_writer: Forwarded to ``MTLRouting``.
        """
        self.job_config = job_config

        if use_pretrain:
            # Use pretrained bert weights
            self.bert_encoder = BertModel.from_pretrained(
                self.job_config.get_model_file_type())
            encoder_config = self.bert_encoder.config
        else:
            encoder_config = BertConfig(**self.job_config.get_model_config())
            encoder_config.vocab_size = len(tokenizer.vocab)
            self.bert_encoder = BertModel(encoder_config)

        self.bert_encoder.to(device)
        self.network = MTLRouting(self.bert_encoder,
                                  write_log=write_log,
                                  summary_writer=summary_writer)

        pretrain_loss = BertPretrainingLoss(self.bert_encoder, encoder_config)
        pretrain_loss.to(device)

        # Pretrain Dataset
        self.network.register_batch(BatchType.PRETRAIN_BATCH,
                                    "pretrain_dataset",
                                    loss_calculation=pretrain_loss)

        self.device = device

    def save(self, filename: str):
        """Save the state dict of ``self.network.module`` to ``filename``."""
        wrapped = self.network.module
        state = wrapped.state_dict()
        return torch.save(state, filename)

    def load(self, model_state_dict: str):
        """Load a saved state dict into ``self.network.module``.

        Args:
            model_state_dict: Location handed to ``torch.load``.

        Returns:
            Whatever ``load_state_dict`` returns.
        """

        def _keep_storage(storage, loc):
            # Hand every storage back unchanged instead of remapping it.
            return storage

        target = self.network.module
        state = torch.load(model_state_dict, map_location=_keep_storage)
        return target.load_state_dict(state)

    def move_batch(self, batch, non_blocking=False):
        """Return ``batch`` moved to this task's device via ``batch.to``."""
        destination = self.device
        return batch.to(destination, non_blocking)

    def eval(self):
        """Put the routing network into evaluation mode."""
        self.network.eval()

    def train(self):
        """Put the routing network into training mode."""
        self.network.train()

    def save_bert(self, filename: str):
        """Save only the BERT encoder's state dict to ``filename``."""
        encoder_state = self.bert_encoder.state_dict()
        return torch.save(encoder_state, filename)

    def to(self, device):
        """Move the routing network to ``device`` (a ``torch.device``)."""
        assert isinstance(device, torch.device)
        self.network.to(device)

    def half(self):
        """Convert the routing network with ``half()``."""
        self.network.half()
class TestBertModel(unittest.TestCase):
    """Check ``turbo_transformers.BertModel`` against the PyTorch BertModel."""

    def init_data(self, use_cuda) -> None:
        """Create the PyTorch model and its turbo counterpart.

        Args:
            use_cuda: Selects ``cuda:0`` when truthy, ``cpu:0`` otherwise.
        """
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        if use_cuda:
            self.test_device = torch.device('cuda:0')
        else:
            self.test_device = torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda):
        """Run both models on one random input and compare the outputs.

        ``torch_result[0][:, 0]`` must match ``turbo_result[0]`` within
        ``atol=1e-3`` and ``rtol=1e-3``.
        """
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        def run_torch():
            return self.torch_model(input_ids)

        def run_turbo():
            return self.turbo_model(input_ids)

        torch_result, torch_qps, _ = test_helper.run_model(
            run_torch, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        with turbo_transformers.pref_guard("bert_perf"):
            turbo_result, turbo_qps, _ = test_helper.run_model(
                run_turbo, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        expected = torch_result[0][:, 0].cpu()
        actual = turbo_result[0].cpu()
        self.assertTrue(
            numpy.allclose(expected, actual, atol=1e-3, rtol=1e-3))

    def test_bert_model(self):
        """Run the GPU check when CUDA is usable, then always the CPU check."""
        cuda_usable = (torch.cuda.is_available()
                       and turbo_transformers.config.is_compiled_with_cuda())
        if cuda_usable:
            self.check_torch_and_turbo(use_cuda=True)
        self.check_torch_and_turbo(use_cuda=False)
class BertEmbed:
    """Wrap a BERT model and tokenizer loaded from files under ``BERT_PATH``."""

    def __init__(self):
        """Load config, vocabulary and TensorFlow checkpoint weights.

        The model is created without a pooling layer, moved to
        ``PT_DEVICE`` and switched to evaluation mode.
        """
        config_file = join(BERT_PATH, 'bert_config.json')
        config = BertConfig.from_json_file(config_file)

        vocab_file = join(BERT_PATH, 'vocab.txt')
        self.tokenizer = BertTokenizer(vocab_file=vocab_file)

        self.model = BertModel(config, add_pooling_layer=False)
        checkpoint = join(BERT_PATH, 'bert_model.ckpt')
        load_tf_weights_in_bert(self.model,
                                tf_checkpoint_path=checkpoint,
                                strip_bert=True)

        self.model.to(PT_DEVICE)
        self.model.eval()

    def get_embedding(self, sentences):
        """Return the model's first output for ``sentences`` as a numpy array.

        Args:
            sentences: Input passed to the tokenizer, which is asked for
                padded PyTorch tensors.
        """
        encoded = self.tokenizer(sentences,
                                 return_tensors='pt',
                                 padding=True)
        encoded = encoded.to(PT_DEVICE)

        # No gradients are recorded for the forward pass.
        with torch.no_grad():
            outputs = self.model(**encoded)
            first_output = outputs[0]

        return first_output.cpu().numpy()
def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None,
               backend: Optional[str] = None,
               use_memory_opt=False):
    """Convert a PyTorch BERT model into a ``BertModel`` on a chosen backend.

    Args:
        model: A PyTorch Bert model.
        device: CPU or GPU device. Defaults to ``model.device``. The model
            is moved to it only when it is a CUDA device and CUDA is
            available.
        backend: Which kernel provider to use, ``"turbo"`` or ``"onnxrt"``.
            When ``None`` it defaults to ``"turbo"`` on GPU and to
            ``"onnxrt"`` on CPU.
        use_memory_opt: Whether to use memory optimisation for variable
            length inputs.
            NOTE(review): this flag is not read anywhere in this function;
            confirm whether it is meant to be forwarded.

    Returns:
        A ``BertModel`` wrapping either the converted turbo layers or the
        prepared ONNX Runtime model.

    Raises:
        ValueError: If ``backend`` is neither ``"turbo"`` nor ``"onnxrt"``.
    """
    use_gpu = False
    if device is None:
        device = model.device

    # We may need to move to GPU explicitly.
    if 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
        if backend is None:
            backend = "turbo"  # On GPU turbo is faster
        use_gpu = True
    elif backend is None:
        backend = "onnxrt"  # On CPU onnxrt is faster

    if backend == "turbo":
        embeddings = BertEmbeddings.from_torch(model.embeddings)
        encoder = BertEncoder.from_torch(model.encoder)
        bertmodel_nopooler = BertModelNoPooler(embeddings, encoder)
        pooler = BertPooler.from_torch(model.pooler)
        return BertModel(bertmodel_nopooler, pooler, "turbo", model.config)

    if backend == "onnxrt":
        import os
        import tempfile

        import onnx
        import onnxruntime
        import onnxruntime.backend

        # Dummy inputs used only to trace the model during export.
        inputs = {
            # list of numerical ids for the tokenised text
            'input_ids':
            torch.randint(32, [2, 32], dtype=torch.long).to(device),
            # dummy list of ones
            'attention_mask':
            torch.ones([2, 32], dtype=torch.long).to(device),
            # dummy list of ones
            'token_type_ids':
            torch.ones([2, 32], dtype=torch.long).to(device),
        }

        # Export into a private temporary directory rather than a fixed
        # path under /tmp, so concurrent conversions cannot overwrite each
        # other's file and nothing is left behind afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            onnx_model_path = os.path.join(tmp_dir, "temp_turbo_onnx.model")
            with open(onnx_model_path, 'wb') as outf:
                torch.onnx.export(
                    model=model,
                    # model input (or a tuple for multiple inputs)
                    args=(inputs['input_ids'], inputs['attention_mask'],
                          inputs['token_type_ids']),
                    f=outf,
                    input_names=[
                        'input_ids', 'attention_mask', 'token_type_ids'
                    ],
                    # the ONNX version to export the model to
                    opset_version=11,
                    # whether to execute constant folding for optimization
                    do_constant_folding=True,
                    output_names=['output'],
                    dynamic_axes={
                        'input_ids': [0, 1],
                        'attention_mask': [0, 1],
                        'token_type_ids': [0, 1]
                    })
            # Read the model back while the temporary file still exists.
            onnx_model = onnx.load_model(f=onnx_model_path)

        onnx_model = onnxruntime.backend.prepare(
            model=onnx_model,
            device='GPU' if use_gpu else "CPU",
            graph_optimization_level=onnxruntime.GraphOptimizationLevel.
            ORT_ENABLE_ALL)
        return BertModel(onnx_model, None, "onnxrt")

    # Previously an unknown backend fell through and returned None.
    raise ValueError(
        f"Unsupported backend {backend!r}; expected 'turbo' or 'onnxrt'.")
class TestBertModel(unittest.TestCase):
    """Compare the "turbo" backend of ``turbo_transformers.BertModel`` with
    the PyTorch BertModel over several batch sizes and sequence lengths."""

    def init_data(self, use_cuda) -> None:
        """Create the PyTorch model and its "turbo" backend counterpart.

        Args:
            use_cuda: Selects ``cuda:0`` when truthy, ``cpu:0`` otherwise.
        """
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        """Run both models on one random input and compare the outputs.

        Args:
            use_cuda: Run on GPU when truthy, CPU otherwise.
            batch_size: First dimension of the random ``input_ids``.
            seq_len: Second dimension of the random ``input_ids``.
            use_memory_opt: When truthy, call
                ``turbo_transformers.bert_opt_mem_allocate_api`` with the
                input shape and model dimensions before the turbo run.

        ``torch_result[0]`` must match ``turbo_result[0]`` within
        ``atol=1e-2`` and ``rtol=1e-3``.
        """
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        """Sweep batch sizes and sequence lengths on GPU (if usable) and CPU.

        Args:
            use_memory_opt: When truthy, switch the allocator schema to
                "model-aware" for the sweep and back to "naive" afterwards.
        """
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        # Restore the allocator schema even when a comparison fails, so a
        # failing sweep does not leave "model-aware" active for other tests.
        try:
            cuda_usable = (
                torch.cuda.is_available()
                and turbo_transformers.config.is_compiled_with_cuda())
            for batch_size in [2, 4, 1]:
                for seq_len in [50, 4, 16]:
                    if cuda_usable:
                        self.check_torch_and_turbo(
                            use_cuda=True,
                            batch_size=batch_size,
                            seq_len=seq_len,
                            use_memory_opt=use_memory_opt)
                    self.check_torch_and_turbo(
                        use_cuda=False,
                        batch_size=batch_size,
                        seq_len=seq_len,
                        use_memory_opt=use_memory_opt)
        finally:
            if use_memory_opt:
                turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        """Run the sweep without memory optimisation."""
        # The memory-optimised sweep, bert_model_test_helper(True), is
        # currently disabled.
        self.bert_model_test_helper(False)
class TestBertModel(unittest.TestCase):
    """Check ``BertModel`` and ``BertModelWithPooler`` from turbo_transformers
    against the PyTorch BertModel."""

    def init_data(self, use_cuda) -> None:
        """Create the PyTorch model and both turbo variants.

        Args:
            use_cuda: Selects ``cuda:0`` when truthy, ``cpu:0`` otherwise.
        """
        torch.set_grad_enabled(False)
        torch.set_num_threads(1)

        if use_cuda:
            self.test_device = torch.device('cuda:0')
        else:
            self.test_device = torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)
        self.turbo_pooler_model = \
            turbo_transformers.BertModelWithPooler.from_torch(
                self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda, use_pooler):
        """Run PyTorch and one turbo variant on a random input and compare.

        Args:
            use_cuda: Run on GPU when truthy, CPU otherwise.
            use_pooler: When truthy, run ``turbo_pooler_model`` and compare
                its first output with ``torch_result[1]``; otherwise run
                ``turbo_model`` and compare with ``torch_result[0][:, 0]``.
        """
        self.init_data(use_cuda)
        num_iter = 2
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(2, 32),
                                  dtype=torch.long,
                                  device=self.test_device)

        def run_torch():
            return self.torch_model(input_ids)

        torch_result, torch_qps, _ = test_helper.run_model(
            run_torch, use_cuda, num_iter)
        print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

        if use_pooler:
            selected_model = self.turbo_pooler_model
        else:
            selected_model = self.turbo_model

        def run_turbo():
            return selected_model(input_ids)

        turbo_result, turbo_qps, _ = test_helper.run_model(
            run_turbo, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        if use_pooler:
            torch_result_final = (torch_result[1]).cpu().numpy()
        else:
            torch_result_final = torch_result[0][:, 0].cpu().numpy()
        turbo_result_final = turbo_result[0].cpu().numpy()

        # TODO(jiaruifang, v_cshi) check why pooler introduce more difference
        if use_pooler:
            encode_diff = numpy.max(
                (torch_result[0][:, 0]).cpu().numpy() -
                turbo_result[1].cpu().numpy()).reshape(-1)
            print("encode output diff: ", encode_diff)

            pooler_diff = numpy.max(
                (turbo_result_final - torch_result_final).reshape(-1))
            print("pooler output diff: ", pooler_diff)

        # The pooler comparison uses looser tolerances.
        if use_pooler:
            atol, rtol = 1e-2, 1e-2
        else:
            atol, rtol = 5e-3, 1e-4

        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=atol,
                           rtol=rtol))

    def test_bert_model(self):
        """Run both variants on GPU when CUDA is usable, then always on CPU."""
        cuda_usable = (torch.cuda.is_available()
                       and turbo_transformers.config.is_compiled_with_cuda())
        if cuda_usable:
            self.check_torch_and_turbo(use_cuda=True, use_pooler=False)
            self.check_torch_and_turbo(use_cuda=True, use_pooler=True)

        self.check_torch_and_turbo(use_cuda=False, use_pooler=False)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=True)