def init_data(self, use_cuda):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)
    self.cfg.output_attentions = True

    torch_attention = BertAttention(self.cfg)
    torch_attention.eval()
    if use_cuda:
        torch_attention.to(test_device)

    # Get FT Attention
    turbo_attention = turbo_transformers.BertAttention.from_torch(
        torch_attention)
    turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
        torch_attention, is_trans_weight=False)

    hidden_size = self.cfg.hidden_size
    input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                              dtype=torch.float32,
                              device=test_device)
    attention_mask = torch.ones((batch_size, seq_length),
                                dtype=torch.float32,
                                device=test_device)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0

    return torch_attention, turbo_attention, turbo_decoder_attention, \
        input_tensor, attention_mask

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                hidden_dropout_prob=0.0)
    torch.set_grad_enabled(False)
    self.torch_ffn = DistilFFN(self.cfg)
    self.torch_ffn.eval()
    self.output_layer_norm = torch.nn.LayerNorm(
        normalized_shape=self.cfg.dim, eps=1e-12)
    if use_cuda:
        self.torch_ffn.to(self.test_device)
        self.output_layer_norm.to(self.test_device)

    self.turbo_ffn = turbo_transformers.DistrillFFN.from_torch(
        self.torch_ffn, self.output_layer_norm)
    # (batch_size, input_len, model_dim)
    self.inputs = torch.rand(size=(batch_size, input_len, self.cfg.dim),
                             dtype=torch.float32,
                             device=self.test_device)
    print(self.cfg.activation)

def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, num_threads: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import cProfile
    import benchmark_helper

    turbo_transformers.set_num_threads(num_threads)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"benchmark does not support {model_name}")

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "turbo", num_threads)

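if __name__ == "__main__":
    # Hypothetical invocation of the benchmark above; the argument values are
    # illustrative assumptions, not values taken from the repository.
    benchmark_turbo_transformers("bert", seq_len=128, batch_size=1, n=100,
                                 num_threads=4)
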
def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                hidden_dropout_prob=0.0)
    torch.set_grad_enabled(False)
    self.torch_transformer_block = DistilTransformerBlock(self.cfg)
    self.torch_transformer_block.eval()
    if use_cuda:
        self.torch_transformer_block.to(self.test_device)

    self.turbo_transformer_block = turbo_transformers.DistrillTransformerBlock.from_torch(
        self.torch_transformer_block)
    # (batch_size, input_len, model_dim)
    self.attention_mask = torch.ones((batch_size, input_len),
                                     dtype=torch.float32,
                                     device=self.test_device)
    self.inputs = torch.rand(size=(batch_size, input_len, self.cfg.dim),
                             dtype=torch.float32,
                             device=self.test_device)

def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig()
    self.torch_layer = AlbertLayer(self.cfg)
    if torch.cuda.is_available():
        self.torch_layer.to(self.test_device)
    self.torch_layer.eval()

    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                         self.hidden_size),
                                   dtype=torch.float32,
                                   device=self.test_device)

    self.attention_mask = torch.ones((batch_size, seq_length),
                                     dtype=torch.float32,
                                     device=self.test_device)
    self.attention_mask = self.attention_mask[:, None, None, :]
    self.attention_mask = (1.0 - self.attention_mask) * -10000.0

    self.turbo_layer = turbo_transformers.AlbertLayer.from_torch(
        self.torch_layer)

def init_data(self, use_cuda):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                hidden_dropout_prob=0.0)
    self.cfg.output_attentions = True
    self.torch_attention = DistilAttention(self.cfg)
    self.torch_sa_layer_norm = nn.LayerNorm(normalized_shape=self.cfg.dim,
                                            eps=1e-12)
    self.torch_attention.eval()
    self.torch_sa_layer_norm.eval()
    if use_cuda:
        self.torch_attention.to(test_device)
        self.torch_sa_layer_norm.to(test_device)

    # Get FT Attention
    self.turbo_attention = turbo_transformers.DistillBertAttention.from_torch(
        self.torch_attention, self.torch_sa_layer_norm)

    hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                                   dtype=torch.float32,
                                   device=test_device)
    # NOTE: the mask of DistilBERT attention differs from the mask of
    # huggingface BERT attention.
    self.attention_mask = torch.ones((batch_size, seq_length),
                                     dtype=torch.float32,
                                     device=test_device)

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.model_dim = 1024
    self.d_ff = 4096

    torch.set_grad_enabled(False)
    onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
    onmt_ffn.eval()
    if use_cuda:
        onmt_ffn.to(self.test_device)

    turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=True)
    turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=False)
    # (batch_size, input_len, model_dim)
    inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                        dtype=torch.float32,
                        device=self.test_device)
    return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                hidden_dropout_prob=0.0)
    torch.set_grad_enabled(False)
    self.torch_model = DistilBertModel(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)

    self.turbo_transformer = turbo_transformers.DistilBertModel.from_torch(
        self.torch_model)
    # token ids of shape (batch_size, input_len)
    self.inputs = torch.randint(low=0,
                                high=self.cfg.vocab_size - 1,
                                size=(batch_size, input_len),
                                dtype=torch.long,
                                device=self.test_device)
    self.attention_mask = torch.ones((batch_size, input_len),
                                     dtype=torch.long,
                                     device=self.test_device)
    self.head_mask = [None] * self.cfg.num_hidden_layers

def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool,
                                 enable_mem_opt: bool):
    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model,
                                                        backend="turbo")
    elif model_name == "albert":
        cfg = transformers.AlbertConfig(hidden_size=768,
                                        num_attention_heads=12,
                                        intermediate_size=3072)
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.DistilBertModel.from_torch(model)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("model-aware")
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg, enable_mem_opt, model_name)
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("naive")
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads,
                                   enable_mem_opt, model_name)

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.head_count = 16
    self.model_dim = 1024  # self.model_dim must be divisible by self.head_count
    self.size_per_head = int(self.model_dim / self.head_count)

    self.query_seq_len_list = query_seq_len_list
    self.key_seq_len_list = key_seq_len_list

    # build the torch model
    self.model = MultiHeadedAttention(self.head_count, self.model_dim)
    self.model.eval()
    if use_cuda:
        self.model.to(self.test_device)

    # prepare torch input data
    self.Q_list = []
    for query_seq_len in query_seq_len_list:
        Q = torch.rand(size=(1, query_seq_len, self.model_dim),  # from_seq
                       dtype=torch.float32,
                       device=self.test_device)
        self.Q_list.append(Q)

    self.K_list = []
    self.V_list = []
    for key_seq_len in key_seq_len_list:
        K = torch.rand(size=(1, key_seq_len, self.model_dim),  # from_seq
                       dtype=torch.float32,
                       device=self.test_device)
        V = torch.rand(size=(1, key_seq_len, self.model_dim),  # to_seq
                       dtype=torch.float32,
                       device=self.test_device)
        self.K_list.append(K)
        self.V_list.append(V)

    # prepare turbo smart batch model
    self.turbo_smart_pad = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
        self.model)

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.head_count = 16
    self.model_dim = 1024  # self.model_dim must be divisible by self.head_count
    self.size_per_head = int(self.model_dim / self.head_count)

    onmt_multi_headed_attention = MultiHeadedAttention(
        self.head_count, self.model_dim)
    onmt_multi_headed_attention.eval()
    torch_layernorm = torch.nn.LayerNorm(self.model_dim, eps=1e-6)
    torch_layernorm.eval()

    if use_cuda:
        onmt_multi_headed_attention.to(self.test_device)
        torch_layernorm.to(self.test_device)

    K = torch.rand(size=(batch_size, key_seq_len, self.model_dim),  # from_seq
                   dtype=torch.float32,
                   device=self.test_device)
    V = torch.rand(size=(batch_size, key_seq_len, self.model_dim),
                   dtype=torch.float32,
                   device=self.test_device)
    Q = torch.rand(size=(batch_size, query_seq_len, self.model_dim),  # to_seq
                   dtype=torch.float32,
                   device=self.test_device)

    turbo_attn_trans = turbo_transformers.MultiHeadedAttention.from_onmt(
        onmt_multi_headed_attention, torch_layernorm, is_trans_weight=True)
    turbo_attn_notrans = turbo_transformers.MultiHeadedAttention.from_onmt(
        onmt_multi_headed_attention, torch_layernorm, is_trans_weight=False)

    if with_quantize_dynamic and not use_cuda:
        self.q_onmt_multi_headed_attention = torch.quantization.quantize_dynamic(
            onmt_multi_headed_attention)
    return onmt_multi_headed_attention, torch_layernorm, turbo_attn_trans, \
        turbo_attn_notrans, Q, K, V

def test_smart_batch(use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = transformers.BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
    torch_model = transformers.BertModel(cfg)
    # model_id = "bert-base-uncased"
    # torch_model = transformers.BertModel.from_pretrained(model_id)
    torch_model.eval()
    torch_model.to(test_device)
    torch.set_grad_enabled(False)
    cfg = torch_model.config

    # use 4 threads for computing
    if not use_cuda:
        turbo_transformers.set_num_threads(4)

    # Initialize a turbo BertModel with smart batching from the torch model.
    turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
        torch_model)

    # a batch of queries with different lengths
    query_seq_len_list = [18, 2, 3, 51]
    input_list = []

    # generate random inputs; of course you can use real data
    for query_seq_len in query_seq_len_list:
        input_seq = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(1, query_seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        input_list.append(input_seq)

    # start inference
    s_res = serial_bert_inference(torch_model, input_list)
    b_res = batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    print(torch.max(torch.abs(b_res - s_res)))
    assert torch.max(torch.abs(b_res - s_res)) < 1e-2

    start_time = time.time()
    for i in range(10):
        serial_bert_inference(torch_model, input_list)
    end_time = time.time()
    print("\ntorch time consumed: {}".format(end_time - start_time))

    start_time = time.time()
    for i in range(10):
        batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    end_time = time.time()
    print("\nturbo time consumed: {}".format(end_time - start_time))

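# The helpers serial_bert_inference and batch_bert_inference are used above
# but not defined in this snippet. A minimal sketch of what they might look
# like, assuming tuple-style model outputs as in the calls above (an
# assumption, not necessarily the repository's exact implementation):
def serial_bert_inference(torch_model, input_list):
    # run each variable-length query one by one and concatenate the
    # sequence outputs along the sequence dimension
    res_list = []
    for input_seq in input_list:
        res, _ = torch_model(input_seq)
        res_list.append(res)
    return torch.cat(res_list, dim=1)


def batch_bert_inference(turbo_model, input_list, query_seq_len_list):
    # the smart-batch model consumes the whole list of queries at once
    res, _ = turbo_model(input_list, query_seq_len_list)
    return res
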
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    if use_gpu:
        print("using GPU")
    else:
        print("using CPU")
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads)

def init_data(self, use_cuda) -> None:
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    self.cfg = BertConfig()
    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)

    self.turbo_model = turbo_transformers.BertModel.from_torch(
        self.torch_model, self.test_device, "turbo")

def test(loadtype: LoadType):
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(model_id)
    model.eval()
    cfg = model.config

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long)
    position_ids = torch.tensor(([1, 0, 0, 0], [1, 1, 1, 0]),
                                dtype=torch.long)
    segment_ids = torch.tensor(([1, 1, 1, 0], [1, 0, 0, 0]), dtype=torch.long)
    torch.set_grad_enabled(False)
    torch_res = model(
        input_ids, position_ids=position_ids, token_type_ids=segment_ids
    )  # sequence_output, pooled_output, (hidden_states), (attentions)
    print("torch bert sequence output: ",
          torch_res[0][:, 0, :])  # get the first sequence
    print("torch bert pooler output: ", torch_res[1])  # pooled_output

    # there are three ways to load a pretrained model
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model, which has loaded a pretrained model
        tt_model = turbo_transformers.BertModel.from_torch(model)
    elif loadtype is LoadType.PRETRAINED:
        # 2. directly load from a checkpoint (torch saved model)
        tt_model = turbo_transformers.BertModel.from_pretrained(model_id)
    elif loadtype is LoadType.NPZ:
        # 3. load the model from an npz file
        if len(sys.argv) == 2:
            print(sys.argv[1])
            in_file = sys.argv[1]
        else:
            in_file = "/workspace/bert_torch.npz"
        tt_model = turbo_transformers.BertModel.from_npz(in_file, cfg)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError("LoadType is not supported")

    res = tt_model(
        input_ids, position_ids=position_ids,
        token_type_ids=segment_ids)  # sequence_output, pooled_output
    print("turbo bert sequence output:", res[0], res[0].size())
    print("turbo bert pooler output: ", res[1])  # pooled_output

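# LoadType is referenced above but not defined in this snippet; a minimal
# sketch of the enum plus a driver. The member names follow the branches
# above; everything else is an assumption. In the original script the enum
# would have to precede the definition of test().
import enum


class LoadType(enum.Enum):
    PYTORCH = "PYTORCH"
    PRETRAINED = "PRETRAINED"
    NPZ = "NPZ"


if __name__ == "__main__":
    test(LoadType.PYTORCH)
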
def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig()
    self.torch_intermediate = BertIntermediate(self.cfg)
    if torch.cuda.is_available():
        self.torch_intermediate.to(self.test_device)
    self.torch_intermediate.eval()

    self.turbo_intermediate = turbo_transformers.BertIntermediate.from_torch(
        self.torch_intermediate)

def test(use_cuda: bool):
    test_device_name = "GPU" if use_cuda else "CPU"
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = transformers.BertConfig()
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = np.array(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=np.int64)
    segment_ids = np.array(([1, 1, 1, 0], [1, 0, 0, 0]), dtype=np.int64)
    input_ids_tensor = turbo_transformers.nparray2tensor(
        input_ids, test_device_name)
    segment_ids_tensor = turbo_transformers.nparray2tensor(
        segment_ids, test_device_name)

    # load the model from an npz file
    if len(sys.argv) == 2:
        print(sys.argv[1])
        in_file = sys.argv[1]
    else:
        in_file = "/home/jiaruifang/codes/TurboTransformers/bert.npz"  # 255 MiB
    tt_model = turbo_transformers.BertModel.from_npz(in_file, cfg,
                                                     test_device)  # 1169 MiB

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids_tensor,
                       token_type_ids=segment_ids_tensor,
                       return_type=turbo_transformers.ReturnType.NUMPY
                       )  # sequence_output, pooled_output
    end_time = time.time()
    print("turbo bert sequence output:", res[0][:, 0, :])
    print("turbo bert pooler output: ", res[1])  # pooled_output
    print("\nturbo time consumed: {}".format(end_time - start_time))

def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig(hidden_size=768,
                            num_attention_heads=12,
                            intermediate_size=3072)
    self.torch_model = AlbertModel(self.cfg)
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)
    self.torch_model.eval()
    self.hidden_size = self.cfg.hidden_size

    self.turbo_model = turbo_transformers.AlbertModel.from_torch(
        self.torch_model)

def test(loadtype: LoadType, use_cuda: bool):
    cfg = transformers.AlbertConfig(hidden_size=768,
                                    num_attention_heads=12,
                                    intermediate_size=3072)
    model = transformers.AlbertModel(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    cfg = model.config

    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    # place the inputs on the same device as the model, otherwise the
    # GPU run would fail with a device mismatch
    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)
    model.to(test_device)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consumed: {}".format(end_time - start_time))

    # only loading from a PyTorch model is supported here
    if loadtype is LoadType.PYTORCH:
        # from a PyTorch model, which has loaded a pretrained model
        tt_model = turbo_transformers.AlbertModel.from_torch(model)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()
    print("\nturbo time consumed: {}".format(end_time - start_time))

    assert numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1

def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig()
    self.torch_model = AlbertModel(self.cfg)
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)
    self.torch_model.eval()
    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.randint(low=0,
                                      high=self.cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      device=self.test_device)

    self.turbo_model = turbo_transformers.AlbertModel.from_torch(
        self.torch_model)

def benchmark_turbo_transformers(model: str, seq_len: int, batch_size: int,
                                 n: int, num_threads: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import cProfile
    import benchmark_helper

    turbo_transformers.set_num_threads(num_threads)
    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(
        model_id)  # type: transformers.BertModel
    model.eval()
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    model = turbo_transformers.BertModel.from_torch(model)

    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "turbo", num_threads)

def test(use_cuda):
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = RobertaConfig()
    torch_model = RobertaModel(cfg)
    torch_model.eval()
    if torch.cuda.is_available():
        torch_model.to(test_device)

    turbo_model = turbo_transformers.RobertaModel.from_torch(
        torch_model, test_device)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=test_device)

    torch_result = torch_model(input_ids)
    torch_result_final = torch_result[0][:, 0].cpu().numpy()

    turbo_result = turbo_model(input_ids)
    turbo_result_final = turbo_result[0].cpu().numpy()

    # See the differences
    # print(numpy.size(torch_result_final), numpy.size(turbo_result_final))
    # print(torch_result_final - turbo_result_final)
    assert numpy.allclose(torch_result_final,
                          turbo_result_final,
                          atol=1e-3,
                          rtol=1e-3)

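# A minimal driver, assuming the snippet above is executed as a script:
if __name__ == "__main__":
    test(use_cuda=torch.cuda.is_available())
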
def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.model_dim = 1024
    self.onmt_encoder = TransformerEncoderLayer(d_model=self.model_dim,
                                                heads=8,
                                                d_ff=1024,
                                                dropout=0.,
                                                attention_dropout=0.)
    self.onmt_encoder.eval()
    if use_cuda:
        self.onmt_encoder.to(self.test_device)

    self.turbo_encoder = turbo_transformers.TransformerEncoderLayer.from_onmt(
        self.onmt_encoder)

    # dynamic quantization of the torch baseline, CPU only
    # https://pytorch.org/docs/stable/quantization.html
    if with_quantize_dynamic and not use_cuda:
        self.quantized_onmt_encoder = torch.quantization.quantize_dynamic(
            self.onmt_encoder)

def __init__(self,
             model,
             fields,
             src_reader,
             tgt_reader,
             gpu=-1,
             n_best=1,
             min_length=0,
             max_length=100,
             ratio=0.,
             beam_size=30,
             random_sampling_topk=1,
             random_sampling_temp=1,
             stepwise_penalty=None,
             dump_beam=False,
             block_ngram_repeat=0,
             ignore_when_blocking=frozenset(),
             replace_unk=False,
             phrase_table="",
             data_type="text",
             verbose=False,
             report_time=False,
             copy_attn=False,
             global_scorer=None,
             out_file=None,
             report_align=False,
             report_score=True,
             logger=None,
             seed=-1):
    self.model = model
    # fjr add turbo: wrap the ONMT decoder with the turbo implementation
    turbo_transformers.set_num_threads(4)
    self.turbo_decoder = turbo_transformers.TransformerDecoder.from_onmt(
        self.model.decoder)

    self.fields = fields
    tgt_field = dict(self.fields)["tgt"].base_field
    self._tgt_vocab = tgt_field.vocab
    self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
    self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
    self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
    self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
    self._tgt_vocab_len = len(self._tgt_vocab)

    self._gpu = gpu
    self._use_cuda = gpu > -1
    self._dev = torch.device("cuda", self._gpu) \
        if self._use_cuda else torch.device("cpu")

    self.n_best = n_best
    self.max_length = max_length

    self.beam_size = beam_size
    self.random_sampling_temp = random_sampling_temp
    self.sample_from_topk = random_sampling_topk

    self.min_length = min_length
    self.ratio = ratio
    self.stepwise_penalty = stepwise_penalty
    self.dump_beam = dump_beam
    self.block_ngram_repeat = block_ngram_repeat
    self.ignore_when_blocking = ignore_when_blocking
    self._exclusion_idxs = {
        self._tgt_vocab.stoi[t]
        for t in self.ignore_when_blocking
    }
    self.src_reader = src_reader
    self.tgt_reader = tgt_reader
    self.replace_unk = replace_unk
    if self.replace_unk and not self.model.decoder.attentional:
        raise ValueError("replace_unk requires an attentional decoder.")
    self.phrase_table = phrase_table
    self.data_type = data_type
    self.verbose = verbose
    self.report_time = report_time

    self.copy_attn = copy_attn

    self.global_scorer = global_scorer
    if self.global_scorer.has_cov_pen and \
            not self.model.decoder.attentional:
        raise ValueError(
            "Coverage penalty requires an attentional decoder.")
    self.out_file = out_file
    self.report_align = report_align
    self.report_score = report_score
    self.logger = logger

    self.use_filter_pred = False
    self._filter_pred = None

    # for debugging
    self.beam_trace = self.dump_beam != ""
    self.beam_accum = None
    if self.beam_trace:
        self.beam_accum = {
            "predicted_ids": [],
            "beam_parent_ids": [],
            "scores": [],
            "log_probs": []
        }

    set_random_seed(seed, self._use_cuda)

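# How this constructor might be invoked (a hedged sketch; in OpenNMT-py a
# Translator is normally assembled by build_translator from parsed options,
# and the scorer configuration below is one plausible choice, not the
# repository's):
# scorer = onmt.translate.GNMTGlobalScorer(alpha=0.7, beta=0.,
#                                          length_penalty="avg",
#                                          coverage_penalty="none")
# translator = Translator(model, fields, src_reader, tgt_reader,
#                         gpu=0, beam_size=5, global_scorer=scorer,
#                         out_file=codecs.open("pred.txt", "w", "utf-8"))
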
def test(loadtype: LoadType, use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(model_id)
    model.eval()
    model.to(test_device)
    torch.set_grad_enabled(False)

    cfg = model.config
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)
    # position_ids = torch.tensor(([1, 0, 0, 0], [1, 1, 1, 0]),
    #                             dtype=torch.long, device=test_device)
    segment_ids = torch.tensor(([1, 1, 1, 0], [1, 0, 0, 0]),
                               dtype=torch.long,
                               device=test_device)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(
            input_ids, token_type_ids=segment_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
    end_time = time.time()
    print("\ntorch time consumed: {}".format(end_time - start_time))

    print("torch bert sequence output: ",
          torch_res[0][:, 0, :])  # get the first sequence
    print("torch bert pooler output: ", torch_res[1])  # pooled_output

    # there are three ways to load a pretrained model
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model, which has loaded a pretrained model.
        # note that you can choose "turbo" or "onnxrt" as the backend;
        # "turbo" is a hand-crafted implementation optimized with OMP.
        tt_model = turbo_transformers.BertModel.from_torch(
            model, test_device, "turbo")
    elif loadtype is LoadType.PRETRAINED:
        # 2. directly load from a checkpoint (torch saved model)
        tt_model = turbo_transformers.BertModel.from_pretrained(
            model_id, test_device)
    elif loadtype is LoadType.NPZ:
        # 3. load the model from an npz file
        if len(sys.argv) == 2:
            print(sys.argv[1])
            in_file = sys.argv[1]
        else:
            in_file = "/workspace/bert_torch.npz"
        tt_model = turbo_transformers.BertModel.from_npz(
            in_file, cfg, test_device)
    else:
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise ValueError("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(
            input_ids,
            token_type_ids=segment_ids)  # sequence_output, pooled_output
    end_time = time.time()
    print("turbo bert sequence output:", res[0][:, 0, :])
    print("turbo bert pooler output: ", res[1])  # pooled_output
    print("\nturbo time consumed: {}".format(end_time - start_time))
    assert torch.max(torch.abs(res[0] - torch_res[0])) < 0.2

        return BertForSequenceClassification(bertmodel, model.classifier)

    @staticmethod
    def from_pretrained(model_id_or_path: str,
                        device: Optional[torch.device] = None):
        # First, use from_pretrained to load the model you trained.
        torch_model = TorchBertForSequenceClassification.from_pretrained(
            model_id_or_path)
        # Then, use the init function of the acceleration model to get it.
        model = BertForSequenceClassification.from_torch(torch_model, device)
        model._torch_model = torch_model  # prevent destroying the torch model
        return model


# use 4 threads for BERT inference
turbo_transformers.set_num_threads(4)

model_id = os.path.join(
    os.path.dirname(__file__),
    'test-seq-classification-model')  # path of the huggingface-format model
tokenizer = BertTokenizer.from_pretrained(
    model_id)  # initialize the tokenizer
turbo_model = BertForSequenceClassification.from_pretrained(
    model_id, torch.device('cpu:0'))  # initialize the acceleration model

# predict after loading the model; the Chinese input means roughly
# "Test whether the BERT model's performance and accuracy meet the requirements?"
input_ids = torch.tensor(
    tokenizer.encode('测试一下bert模型的性能和精度是不是符合要求?',
                     add_special_tokens=True)).unsqueeze(0)
torch_result = turbo_model(input_ids)
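
# A possible way to read off the predicted class; that the wrapper returns
# the classification logits as its first output is an assumption:
# logits = torch_result[0]
# predicted_label = int(torch.argmax(logits, dim=-1))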