def from_torch(model: TorchBertModel,
               device: Optional[torch.device] = None):
    if device is not None and 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
    embeddings = BertEmbeddings.from_torch(model.embeddings)
    encoder = BertEncoder.from_torch(model.encoder)
    return BertModelNoPooler(embeddings, encoder)
def from_torch(model: TorchBertModel,  # implementation of from_torch
               device: Optional[torch.device] = None):
    if device is not None and 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
    bertmodel = turbo_transformers.BertModel.from_torch(model.bert)
    # The following code can be copied as-is and does not need to change.
    # Note: classifier is a class member of BertForSequenceClassification. If the
    # user defines other class members, they need to modify this part accordingly.
    return BertForSequenceClassification(bertmodel, model.classifier)
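A rough usage sketch for the converter above. It assumes the turbo_transformers package is importable and that a fine-tuned Hugging Face BertForSequenceClassification checkpoint is available; the checkpoint name and label count are placeholders.

# Hypothetical usage of the sequence-classification converter above.
# The checkpoint name and num_labels are placeholders.
import torch
import transformers

torch_model = transformers.BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2)
torch_model.eval()

# Convert the fine-tuned PyTorch model; the classifier head is reused as-is.
turbo_classifier = from_torch(torch_model, device=torch.device('cpu'))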
Example #3
class BertMultiTask:
    def __init__(self, job_config, use_pretrain, tokenizer, cache_dir, device, write_log, summary_writer):
        self.job_config = job_config

        if not use_pretrain:
            model_config = self.job_config.get_model_config()
            bert_config = BertConfig(**model_config)
            bert_config.vocab_size = len(tokenizer.vocab)

            self.bert_encoder = BertModel(bert_config)
        # Use pretrained bert weights
        else:
            self.bert_encoder = BertModel.from_pretrained(self.job_config.get_model_file_type())
            bert_config = self.bert_encoder.config
        self.bert_encoder.to(device)

        self.network = MTLRouting(self.bert_encoder, write_log=write_log, summary_writer=summary_writer)

        #config_data=self.config['data']
        loss_calculation = BertPretrainingLoss(self.bert_encoder, bert_config)
        loss_calculation.to(device)
        # Pretrain Dataset
        self.network.register_batch(BatchType.PRETRAIN_BATCH, "pretrain_dataset", loss_calculation=loss_calculation)

        self.device = device
        # self.network = self.network.float()
        # print(f"Bert ID: {id(self.bert_encoder)}  from GPU: {dist.get_rank()}")

    def save(self, filename: str):
        network = self.network.module
        return torch.save(network.state_dict(), filename)

    def load(self, model_state_dict: str):
        return self.network.module.load_state_dict(torch.load(model_state_dict, map_location=lambda storage, loc: storage))

    def move_batch(self, batch, non_blocking=False):
        return batch.to(self.device, non_blocking=non_blocking)

    def eval(self):
        self.network.eval()

    def train(self):
        self.network.train()

    def save_bert(self, filename: str):
        return torch.save(self.bert_encoder.state_dict(), filename)

    def to(self, device):
        assert isinstance(device, torch.device)
        self.network.to(device)

    def half(self):
        self.network.half()
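A rough usage sketch of the checkpointing helpers defined above. It assumes job_config and tokenizer objects compatible with the constructor already exist, and that self.network gets wrapped (e.g. in DistributedDataParallel) so that .module is available to save()/load(); file paths and the log/summary arguments are placeholders.

# Hypothetical checkpointing round trip for BertMultiTask; paths are placeholders.
trainer = BertMultiTask(job_config, use_pretrain=True, tokenizer=tokenizer,
                        cache_dir='/tmp/bert_cache', device=torch.device('cpu'),
                        write_log=print, summary_writer=None)
trainer.train()                              # put the MTL network in training mode
# ... run pretraining batches through trainer.network here ...
trainer.save('multitask_checkpoint.pt')      # full multi-task network weights
trainer.save_bert('bert_encoder_only.pt')    # encoder weights only
trainer.load('multitask_checkpoint.pt')      # restore the multi-task network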
Example #4
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        self.assertTrue(
            numpy.allclose(torch_result[0][:, 0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-3,
                           rtol=1e-3))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
            turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
        self.check_torch_and_turbo(use_cuda=False)
class BertEmbed:
    def __init__(self):
        config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
        self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
        self.model = BertModel(config, add_pooling_layer=False)
        load_tf_weights_in_bert(self.model,
                                tf_checkpoint_path=join(
                                    BERT_PATH, 'bert_model.ckpt'),
                                strip_bert=True)
        self.model.to(PT_DEVICE)
        self.model.eval()

    def get_embedding(self, sentences):
        x = self.tokenizer(sentences, return_tensors='pt',
                           padding=True).to(PT_DEVICE)
        with torch.no_grad():
            output = self.model(**x)[0]
        return output.cpu().numpy()
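A short usage sketch for BertEmbed, assuming BERT_PATH points at a TensorFlow BERT checkpoint directory (bert_config.json, vocab.txt, bert_model.ckpt) and PT_DEVICE is a torch.device, as in the class above; the input sentences are placeholders.

# Hypothetical usage of BertEmbed; input sentences are placeholders.
embedder = BertEmbed()
vectors = embedder.get_embedding(['The cat sat on the mat.',
                                  'Transformers encode text.'])
# vectors has shape (batch_size, max_seq_len, hidden_size)
print(vectors.shape)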
    def from_torch(model: TorchBertModel,
                   device: Optional[torch.device] = None,
                   backend: Optional[str] = None,
                   use_memory_opt=False):
        """
        Args:
            model : a PyTorch Bert Model
            device : cpu or GPU
            backend : a string to indicates kernel provides
            Four options. [onnxrt-cpu, onnxrt-gpu, turbo-cpu, turbo-gpu]
            use_memory_opt [bool] whether or not use memory opt for variable length inputs.
        """
        use_gpu = False
        if device is None:
            device = model.device
        # we may need to move to GPU explicitly
        if 'cuda' in device.type and torch.cuda.is_available():
            model.to(device)
            if backend is None:
                backend = "turbo"  # On GPU turbo is faster
            use_gpu = True
        else:
            if backend is None:
                backend = "onnxrt"  # On CPU onnxrt is faster

        if backend == "turbo":
            embeddings = BertEmbeddings.from_torch(model.embeddings)
            encoder = BertEncoder.from_torch(model.encoder)
            bertmodel_nopooler = BertModelNoPooler(embeddings, encoder)
            pooler = BertPooler.from_torch(model.pooler)
            return BertModel(bertmodel_nopooler, pooler, "turbo", model.config)
        elif backend == "onnxrt":
            import onnx
            import onnxruntime
            import onnxruntime.backend
            inputs = {
                'input_ids':
                torch.randint(32, [2, 32], dtype=torch.long).to(
                    device),  # list of numerical ids for the tokenised text
                'attention_mask':
                torch.ones([2, 32],
                           dtype=torch.long).to(device),  # dummy list of ones
                'token_type_ids':
                torch.ones([2, 32],
                           dtype=torch.long).to(device),  # dummy list of ones
            }
            onnx_model_path = "/tmp/temp_turbo_onnx.model"
            with open(onnx_model_path, 'wb') as outf:
                torch.onnx.export(
                    model=model,
                    args=(inputs['input_ids'], inputs['attention_mask'],
                          inputs['token_type_ids']
                          ),  # model input (or a tuple for multiple inputs)
                    f=outf,
                    input_names=[
                        'input_ids', 'attention_mask', 'token_type_ids'
                    ],
                    opset_version=11,  # the ONNX version to export the model to
                    do_constant_folding=
                    True,  # whether to execute constant folding for optimization
                    output_names=['output'],
                    dynamic_axes={
                        'input_ids': [0, 1],
                        'attention_mask': [0, 1],
                        'token_type_ids': [0, 1]
                    })
            # num_threads = "8"
            # os.environ['OMP_NUM_THREADS'] = str(num_threads)
            # os.environ['MKL_NUM_THREADS'] = str(num_threads)
            onnx_model = onnx.load_model(f=onnx_model_path)
            onnx_model = onnxruntime.backend.prepare(
                model=onnx_model,
                device='GPU' if use_gpu else "CPU",
                graph_optimization_level=onnxruntime.GraphOptimizationLevel.
                ORT_ENABLE_ALL)
            return BertModel(onnx_model, None, "onnxrt")
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
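The model-aware allocator path exercised in bert_model_test_helper reduces to a few library calls. The condensed sketch below uses only the turbo_transformers functions that appear in the test; batch_size, seq_len, cfg, use_cuda, turbo_model, and input_ids stand in for the test's local variables.

# Condensed sketch of the model-aware allocator flow from the test above.
turbo_transformers.reset_allocator_schema("model-aware")   # switch allocator schema
turbo_transformers.bert_opt_mem_allocate_api(
    batch_size, seq_len,                    # shape of the incoming batch
    cfg.num_attention_heads,
    cfg.hidden_size,
    cfg.num_hidden_layers,
    "GPU" if use_cuda else "CPU")
result = turbo_model(input_ids)             # inference runs against the planned buffers
turbo_transformers.reset_allocator_schema("naive")         # restore the default schema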
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(1)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

        self.turbo_pooler_model = turbo_transformers.BertModelWithPooler.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda, use_pooler):
        self.init_data(use_cuda)
        num_iter = 2
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(2, 32),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (
            lambda: self.turbo_pooler_model(input_ids)) if use_pooler else (
                lambda: self.turbo_model(input_ids))
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        torch_result_final = torch_result[1].cpu().numpy() if use_pooler \
            else torch_result[0][:, 0].cpu().numpy()

        turbo_result_final = turbo_result[0].cpu().numpy()

        # TODO(jiaruifang, v_cshi) check why the pooler introduces a larger difference
        if use_pooler:
            print(
                "encode output diff: ",
                numpy.max((torch_result[0][:, 0]).cpu().numpy() -
                          turbo_result[1].cpu().numpy()).reshape(-1))
            print(
                "pooler output diff: ",
                numpy.max(
                    (turbo_result_final - torch_result_final).reshape(-1)))
        (atol, rtol) = (1e-2, 1e-2) if use_pooler else (5e-3, 1e-4)

        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=atol,
                           rtol=rtol))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
            turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True, use_pooler=False)
            self.check_torch_and_turbo(use_cuda=True, use_pooler=True)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=False)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=True)