Example no. 1
def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None):
    # Move the PyTorch model to the requested CUDA device before wrapping it.
    if device is not None and 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
    # Build turbo_transformers counterparts from the PyTorch submodules.
    embeddings = BertEmbeddings.from_torch(model.embeddings)
    encoder = BertEncoder.from_torch(model.encoder)
    return BertModelNoPooler(embeddings, encoder)
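This staticmethod only rewraps the embeddings and encoder of an already-loaded PyTorch BERT model. A minimal usage sketch, assuming this from_torch is the staticmethod of BertModelNoPooler in turbo_transformers and that the HuggingFace transformers package and the bert-base-uncased checkpoint are available (these are assumptions, not part of the example):

# Hypothetical usage; checkpoint name and device choice are assumptions.
import torch
from transformers.models.bert.modeling_bert import BertModel as TorchBertModel

torch_model = TorchBertModel.from_pretrained("bert-base-uncased")
torch_model.eval()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Wrap the PyTorch embeddings and encoder; no pooler is attached.
turbo_model = BertModelNoPooler.from_torch(torch_model, device)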
def from_torch(
        model: TorchBertModel,
        device: Optional[torch.device] = None  # from_torch function implementation
):
    if device is not None and "cuda" in device.type and torch.cuda.is_available():
        model.to(device)
    bertmodel = turbo_transformers.BertModel.from_torch(model.bert)
    # The following code can be copied and used unchanged.
    # Note: classifier is a class member of BertForSequenceClassification. If the user
    # defines other class members, they need to be handled here as well.
    return BertForSequenceClassification(bertmodel, model.classifier)
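The same pattern applies to a fine-tuned classifier: only model.bert is converted, while the classification head is reused as a plain PyTorch module. A hedged usage sketch, assuming this from_torch is the staticmethod of the wrapper BertForSequenceClassification returned above and that a HuggingFace checkpoint is available (both assumptions):

# Hypothetical usage; checkpoint name and device choice are assumptions.
import torch
from transformers import (
    BertForSequenceClassification as TorchBertForSequenceClassification)

torch_classifier = TorchBertForSequenceClassification.from_pretrained("bert-base-uncased")
torch_classifier.eval()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.bert is wrapped by turbo_transformers; model.classifier stays a torch.nn module.
turbo_classifier = BertForSequenceClassification.from_torch(torch_classifier, device)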
# Imports required by this test (the transformers import path is an assumption):
import unittest
import numpy
import torch
import turbo_transformers
import test_helper
from transformers.models.bert.modeling_bert import BertModel, BertConfig


class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
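For reference, the memory-optimized path that bert_model_test_helper(True) would exercise switches the allocator schema around inference and pre-allocates activation buffers for the given shapes. A rough sketch using the names from the test above (batch_size, seq_len, cfg, use_cuda, input_ids, and turbo_model are assumed to be set up as in check_torch_and_turbo):

# Sketch only; all variables are assumed to exist as in the test above.
turbo_transformers.reset_allocator_schema("model-aware")
turbo_transformers.bert_opt_mem_allocate_api(
    batch_size, seq_len,
    cfg.num_attention_heads, cfg.hidden_size, cfg.num_hidden_layers,
    "GPU" if use_cuda else "CPU")
result = turbo_model(input_ids)
turbo_transformers.reset_allocator_schema("naive")  # restore the default allocator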
Example no. 4
    def from_torch(model: TorchBertModel,
                   device: Optional[torch.device] = None,
                   backend: Optional[str] = None,
                   use_memory_opt=False):
        """
        Args:
            model : a PyTorch Bert Model
            device : cpu or GPU
            backend : a string indicating the kernel provider.
            Four options: [onnxrt-cpu, onnxrt-gpu, turbo-cpu, turbo-gpu]
            use_memory_opt (bool) : whether or not to use memory optimization for variable-length inputs.
        """
        use_gpu = False
        if device is None:
            device = model.device
        # we may need to move to GPU explicitly
        if 'cuda' in device.type and torch.cuda.is_available():
            model.to(device)
            if backend is None:
                backend = "turbo"  # On GPU turbo is faster
            use_gpu = True
        else:
            if backend is None:
                backend = "onnxrt"  # On CPU onnxrt is faster

        if backend == "turbo":
            embeddings = BertEmbeddings.from_torch(model.embeddings)
            encoder = BertEncoder.from_torch(model.encoder)
            bertmodel_nopooler = BertModelNoPooler(embeddings, encoder)
            pooler = BertPooler.from_torch(model.pooler)
            return BertModel(bertmodel_nopooler, pooler, "turbo", model.config)
        elif backend == "onnxrt":
            import onnx
            import onnxruntime
            import onnxruntime.backend
            inputs = {
                # list of numerical ids for the tokenised text
                'input_ids': torch.randint(32, [2, 32],
                                           dtype=torch.long).to(device),
                # dummy list of ones
                'attention_mask': torch.ones([2, 32],
                                             dtype=torch.long).to(device),
                # dummy list of ones
                'token_type_ids': torch.ones([2, 32],
                                             dtype=torch.long).to(device),
            }
            onnx_model_path = "/tmp/temp_turbo_onnx.model"
            with open(onnx_model_path, 'wb') as outf:
                torch.onnx.export(
                    model=model,
                    # model input (or a tuple for multiple inputs)
                    args=(inputs['input_ids'], inputs['attention_mask'],
                          inputs['token_type_ids']),
                    f=outf,
                    input_names=[
                        'input_ids', 'attention_mask', 'token_type_ids'
                    ],
                    opset_version=11,  # the ONNX version to export the model to
                    # whether to execute constant folding for optimization
                    do_constant_folding=True,
                    output_names=['output'],
                    dynamic_axes={
                        'input_ids': [0, 1],
                        'attention_mask': [0, 1],
                        'token_type_ids': [0, 1]
                    })
            # num_threads = "8"
            # os.environ['OMP_NUM_THREADS'] = str(num_threads)
            # os.environ['MKL_NUM_THREADS'] = str(num_threads)
            onnx_model = onnx.load_model(f=onnx_model_path)
            onnx_model = onnxruntime.backend.prepare(
                model=onnx_model,
                device='GPU' if use_gpu else 'CPU',
                graph_optimization_level=onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL)
            return BertModel(onnx_model, None, "onnxrt")
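Putting the docstring above into practice, a backend can be selected explicitly when converting a model. A hedged usage sketch, assuming the function shown is turbo_transformers.BertModel.from_torch as the test above suggests; the checkpoint name, input shape, and keyword call style are assumptions, and the onnxrt path additionally requires onnx and onnxruntime to be installed:

# Hypothetical usage; checkpoint name and input shape are assumptions.
import torch
import turbo_transformers
from transformers.models.bert.modeling_bert import BertModel as TorchBertModel

torch_model = TorchBertModel.from_pretrained("bert-base-uncased")
torch_model.eval()

# Force the turbo kernels on CPU (the docstring above says the CPU default is onnxrt).
turbo_model = turbo_transformers.BertModel.from_torch(
    torch_model, torch.device("cpu:0"), backend="turbo")

input_ids = torch.randint(0, torch_model.config.vocab_size - 1, (1, 32),
                          dtype=torch.long)
output = turbo_model(input_ids)  # same call pattern as in the test above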