def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    num_threads: int):
    """Benchmark a randomly initialised PyTorch transformer model.

    A default-configured model is built, switched to eval mode and invoked
    through ``benchmark_helper.run_model`` on a fixed random batch of token
    ids of shape ``(batch_size, seq_len)``.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        seq_len: Sequence length of the random input batch.
        batch_size: Batch size of the random input batch.
        n: Forwarded to ``benchmark_helper.run_model`` (presumably the
            number of timed iterations -- confirm against the helper).
        num_threads: Passed to ``torch.set_num_threads`` and forwarded to
            the helper.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Names of the (config class, model class) pair inside `transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Validate before the heavy imports so a bad name fails fast. A real
    # exception type is required: raising a plain string is itself a
    # TypeError in Python 3 and hides the intended message.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import torch
    import transformers
    import benchmark_helper

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.eval()

    cfg = model.config  # type: transformers.BertConfig
    # `high` is exclusive in torch.randint, so ids fall in
    # [0, vocab_size - 2].
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    # The literal False is presumably the helper's use-GPU flag -- confirm
    # against benchmark_helper.run_model.
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "torch", num_threads)
def generate_onnx_model(model_name: str, filename: str, seq_len: int,
                        batch_size: int, backend: str):
    """Export a randomly initialised transformer model to an ONNX file.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        filename: Path of the ONNX file to write (opened in binary mode).
        seq_len: Sequence length of the example input used for the export.
        batch_size: Batch size of the example input used for the export.
        backend: ``"GPU"`` selects ``cuda:0``; anything else selects
            ``cpu:0``.

    Returns:
        The vocabulary size of the exported model's config.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Names of the (config class, model class) pair inside `transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Validate before the heavy imports and before touching the output
    # file. Raising a plain string is a TypeError in Python 3, so a real
    # exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import transformers
    import torch

    test_device = torch.device('cuda:0') if backend == "GPU" else torch.device(
        'cpu:0')
    torch.set_grad_enabled(False)

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.eval()
    model.to(test_device)

    cfg = model.config  # type: transformers.BertConfig
    # `high` is exclusive in torch.randint, so ids fall in
    # [0, vocab_size - 2].
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)
    # Leaving the `with` block closes (and therefore flushes) the file.
    with open(filename, 'wb') as outf:
        torch.onnx.export(model=model, args=(input_ids, ), f=outf)
    return cfg.vocab_size
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool,
                                 enable_mem_opt: bool):
    """Benchmark a turbo_transformers model converted from a PyTorch model.

    A randomly initialised ``transformers`` model is built, moved to the
    target device, put in eval mode and converted with the matching
    ``turbo_transformers`` class' ``from_torch``.

    Args:
        model_name: One of ``"bert"``, ``"albert"``, ``"roberta"`` or
            ``"distilbert"``.
        seq_len: Sequence length of the fixed random input (only used when
            ``enable_random`` is false).
        batch_size: Batch size of the fixed random input (only used when
            ``enable_random`` is false).
        n: Forwarded to the benchmark helper (presumably the iteration
            count -- confirm against benchmark_helper).
        enable_random: When true, run ``benchmark_helper.run_variable_model``
            instead of the fixed-shape ``run_model``.
        max_seq_len: Forwarded to ``run_variable_model``.
        min_seq_len: Forwarded to ``run_variable_model``.
        num_threads: Passed to ``turbo_transformers.set_num_threads`` and
            forwarded to the helper.
        use_gpu: Selects ``cuda:0`` when true, ``cpu:0`` otherwise.
        enable_mem_opt: When true (and ``enable_random`` is true), the
            allocator schema is switched to ``"model-aware"`` for the run
            and reset to ``"naive"`` afterwards.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # model name -> (config class name, config kwargs, model class name,
    # extra kwargs for from_torch). The model class name is the same in
    # `transformers` and `turbo_transformers`.
    model_specs = {
        "bert": ("BertConfig", {}, "BertModel", {"backend": "turbo"}),
        "albert": ("AlbertConfig", {
            "hidden_size": 768,
            "num_attention_heads": 12,
            "intermediate_size": 3072,
        }, "AlbertModel", {}),
        "roberta": ("RobertaConfig", {}, "RobertaModel", {}),
        "distilbert": ("DistilBertConfig", {}, "DistilBertModel", {}),
    }
    # Validate before the heavy imports. Raising a plain string is a
    # TypeError in Python 3, so a real exception type is used.
    if model_name not in model_specs:
        raise ValueError(f"benchmark does not support {model_name}")

    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)

    (config_cls_name, config_kwargs, model_cls_name,
     from_torch_kwargs) = model_specs[model_name]
    cfg = getattr(transformers, config_cls_name)(**config_kwargs)
    model = getattr(transformers, model_cls_name)(cfg)
    model.to(test_device)
    model.eval()
    model = getattr(turbo_transformers,
                    model_cls_name).from_torch(model, **from_torch_kwargs)

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("model-aware")
        try:
            benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                                min_seq_len, "turbo",
                                                num_threads, cfg,
                                                enable_mem_opt, model_name)
        finally:
            # Restore the allocator even if the benchmark run fails, so a
            # failed run does not leave the schema switched.
            if enable_mem_opt:
                turbo_transformers.reset_allocator_schema("naive")
    else:
        # `high` is exclusive in torch.randint, so ids fall in
        # [0, vocab_size - 2].
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads,
                                   enable_mem_opt, model_name)
def generate_onnx_model(model_name: str,
                        use_gpu: bool,
                        filename: str,
                        seq_len: int,
                        batch_size: int,
                        backend: str,
                        use_dynamic_axes: bool = False):
    """Export a transformer model to an ONNX file.

    For ``"bert"``, the module-level flag ``checkonnxrest`` (defined outside
    this function) selects between the pretrained ``bert-base-uncased``
    weights and a randomly initialised default config.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        use_gpu: Together with ``backend == "GPU"`` selects ``cuda:0``;
            otherwise ``cpu:0`` is used.
        filename: Path of the ONNX file to write (opened in binary mode).
        seq_len: Sequence length of the example input used for the export.
        batch_size: Batch size of the example input used for the export.
        backend: Must be ``"GPU"`` (and ``use_gpu`` true) to export on CUDA.
        use_dynamic_axes: When true, axes 0 and 1 of the input and output
            are exported as dynamic.

    Returns:
        A ``(vocab_size, config)`` tuple taken from the exported model.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Names of the (config class, model class) pair inside `transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Validate before the heavy imports and before touching the output
    # file. Raising a plain string is a TypeError in Python 3, so a real
    # exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import transformers
    import torch

    test_device = torch.device(
        'cuda:0') if backend == "GPU" and use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)

    if model_name == "bert" and checkonnxrest:
        # use a real model to check the correctness
        model = transformers.BertModel.from_pretrained("bert-base-uncased")
    else:
        config_cls_name, model_cls_name = model_classes[model_name]
        cfg = getattr(transformers, config_cls_name)()
        model = getattr(transformers, model_cls_name)(cfg)

    model.eval()
    model.to(test_device)

    # Read the config back from the model: the pretrained path above never
    # builds a local config.
    cfg = model.config  # type: transformers.BertConfig
    # `high` is exclusive in torch.randint, so ids fall in
    # [0, vocab_size - 2].
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)
    # Leaving the `with` block closes (and therefore flushes) the file.
    with open(filename, 'wb') as outf:
        if not use_dynamic_axes:
            torch.onnx.export(model=model, args=(input_ids, ), f=outf)
        else:
            # If not intended to make onnxruntime support variable batch size
            # and sequence length, you can unset the parameter `dynamic_axes`.
            # For some model, you have to try `opset_version=12`
            torch.onnx.export(model=model,
                              args=(input_ids, ),
                              f=outf,
                              input_names=['input'],
                              output_names=['output'],
                              dynamic_axes={
                                  'input': [0, 1],
                                  'output': [0, 1]
                              })
    return cfg.vocab_size, cfg
def __init__(self, albert_for_math_config):
    """Build the ALBERT encoder and the projection head on top of it.

    Args:
        albert_for_math_config: Config object providing ``hidden_size``,
            ``embedding_size``, ``vocab_size`` and ``hidden_act``; it is
            also passed to ``transformers.AlbertModel``.
    """
    # The parent initialiser must actually be invoked: a bare
    # `super(...)` expression does nothing. The class presumably derives
    # from torch.nn.Module, which refuses submodule assignment before its
    # __init__ has run -- confirm against the class header.
    super(AlbertForMathQuestions, self).__init__()
    self.albert = transformers.AlbertModel(albert_for_math_config)
    self.dense = nn.Linear(albert_for_math_config.hidden_size,
                           albert_for_math_config.embedding_size)
    self.output_projection = nn.Linear(
        albert_for_math_config.embedding_size,
        albert_for_math_config.vocab_size)
    self.layer_norm = nn.LayerNorm(albert_for_math_config.embedding_size)
    # Use the constructor argument; there is no local named `config` here.
    # NOTE(review): assumes `transformers` exposes ACT2FN at the top level
    # for the installed version -- confirm.
    self.activation = transformers.ACT2FN[albert_for_math_config.hidden_act]
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool):
    """Benchmark a turbo_transformers model converted from a PyTorch model.

    A randomly initialised, default-configured ``transformers`` model is
    built, moved to the target device, put in eval mode and converted with
    the matching ``turbo_transformers`` class' ``from_torch``.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        seq_len: Sequence length of the fixed random input (only used when
            ``enable_random`` is false).
        batch_size: Batch size of the fixed random input (only used when
            ``enable_random`` is false).
        n: Forwarded to the benchmark helper (presumably the iteration
            count -- confirm against benchmark_helper).
        enable_random: When true, run ``benchmark_helper.run_variable_model``
            instead of the fixed-shape ``run_model``.
        max_seq_len: Forwarded to ``run_variable_model``.
        min_seq_len: Forwarded to ``run_variable_model``.
        num_threads: Passed to ``turbo_transformers.set_num_threads`` and
            forwarded to the helper.
        use_gpu: Selects ``cuda:0`` when true, ``cpu:0`` otherwise.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # model name -> (config class name, model class name). The model class
    # name is the same in `transformers` and `turbo_transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Validate before the heavy imports. Raising a plain string is a
    # TypeError in Python 3, so a real exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    print("using GPU" if use_gpu else "using CPU")
    torch.set_grad_enabled(False)

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.to(test_device)
    model.eval()
    model = getattr(turbo_transformers, model_cls_name).from_torch(model)

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg)
    else:
        # `high` is exclusive in torch.randint, so ids fall in
        # [0, vocab_size - 2].
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads)
def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                        enable_random: bool, max_seq_len: int,
                        min_seq_len: int, num_threads: int, use_gpu: bool,
                        enable_mem_opt: bool):
    """Benchmark a ``torch.jit.trace``-d transformer and print JSON results.

    A randomly initialised, default-configured model is traced on a fixed
    random batch of shape ``(batch_size, seq_len)``, invoked once untimed,
    then invoked ``n`` times under a ``contexttimer.Timer``. A JSON object
    with QPS, elapsed time and the run parameters is printed to stdout.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        seq_len: Sequence length of the random input batch.
        batch_size: Batch size of the random input batch.
        n: Number of timed forward passes.
        enable_random: Accepted but not used by this function.
        max_seq_len: Accepted but not used by this function.
        min_seq_len: Accepted but not used by this function.
        num_threads: Passed to ``torch.set_num_threads`` and reported in the
            output.
        use_gpu: Accepted but not used by this function.
        enable_mem_opt: Accepted but not used by this function.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Names of the (config class, model class) pair inside `transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Validate before the heavy imports. Raising a plain string is a
    # TypeError in Python 3, so a real exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import transformers
    import contexttimer
    import torch.jit

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.eval()

    # `high` is exclusive in torch.randint, so ids fall in
    # [0, vocab_size - 2].
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        # One untimed call before the measured loop.
        model(input_ids)
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    # `json` is expected to be imported at module level.
    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads,
            "model_name": model_name
        }))
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int):
    """Benchmark a turbo_transformers model on ``cuda:0``.

    A randomly initialised, default-configured ``transformers`` model is
    built, moved to ``cuda:0``, put in eval mode and converted with the
    matching ``turbo_transformers`` class' ``from_torch``. If CUDA is not
    available a message is printed and the function returns without
    benchmarking.

    Args:
        model_name: One of ``"bert"``, ``"albert"`` or ``"roberta"``.
        seq_len: Sequence length of the random input batch.
        batch_size: Batch size of the random input batch.
        n: Forwarded to ``benchmark_helper.run_model`` (presumably the
            iteration count -- confirm against the helper).

    Raises:
        ValueError: If ``model_name`` is not supported. The name is checked
            before the CUDA availability check, so a bad name is reported
            regardless of hardware.
    """
    # model name -> (config class name, model class name). The model class
    # name is the same in `transformers` and `turbo_transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
    }
    # Raising a plain string is a TypeError in Python 3, so a real
    # exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    if not torch.cuda.is_available():
        print("cuda is not available for torch")
        return
    test_device = torch.device('cuda:0')

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.to(test_device)
    model.eval()
    model = getattr(turbo_transformers, model_cls_name).from_torch(model)

    # Reuse the config the model was built from instead of reading
    # `.config` back from the converted model; whether the turbo wrapper
    # exposes that attribute is not visible here.
    # `high` is exclusive in torch.randint, so ids fall in
    # [0, vocab_size - 2].
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)
    # The literal True is presumably the helper's use-GPU flag -- confirm
    # against benchmark_helper.run_model.
    benchmark_helper.run_model(lambda: model(input_ids), True, n, batch_size,
                               seq_len, "turbo")
def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    enable_random: bool, max_seq_len: int, min_seq_len: int,
                    num_threads: int, use_gpu: bool, enable_mem_opt: bool):
    """Benchmark a randomly initialised PyTorch transformer model.

    Args:
        model_name: One of ``"bert"``, ``"albert"``, ``"roberta"`` or
            ``"distilbert"``.
        seq_len: Sequence length of the fixed random input (only used when
            ``enable_random`` is false).
        batch_size: Batch size of the fixed random input (only used when
            ``enable_random`` is false).
        n: Forwarded to the benchmark helper (presumably the iteration
            count -- confirm against benchmark_helper).
        enable_random: When true, run ``benchmark_helper.run_variable_model``
            instead of the fixed-shape ``run_model``.
        max_seq_len: Forwarded to ``run_variable_model``.
        min_seq_len: Forwarded to ``run_variable_model``.
        num_threads: Passed to ``torch.set_num_threads`` and forwarded to
            the helper.
        use_gpu: Selects ``cuda:0`` when true, ``cpu:0`` otherwise.
        enable_mem_opt: Forwarded unchanged to the benchmark helper.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Names of the (config class, model class) pair inside `transformers`.
    model_classes = {
        "bert": ("BertConfig", "BertModel"),
        "albert": ("AlbertConfig", "AlbertModel"),
        "roberta": ("RobertaConfig", "RobertaModel"),
        "distilbert": ("DistilBertConfig", "DistilBertModel"),
    }
    # Validate before the heavy imports. Raising a plain string is a
    # TypeError in Python 3, so a real exception type is used.
    if model_name not in model_classes:
        raise ValueError(f"benchmark does not support {model_name}")

    import torch
    import transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)
    torch.set_num_threads(num_threads)

    config_cls_name, model_cls_name = model_classes[model_name]
    cfg = getattr(transformers, config_cls_name)()
    model = getattr(transformers, model_cls_name)(cfg)
    model.eval()
    model.to(test_device)

    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "torch", num_threads,
                                            cfg, enable_mem_opt, model_name)
    else:
        # `high` is exclusive in torch.randint, so ids fall in
        # [0, vocab_size - 2].
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "torch", num_threads,
                                   enable_mem_opt, model_name)
def test(loadtype: LoadType, use_cuda: bool):
    """Compare an ALBERT model's PyTorch output with its turbo conversion.

    A randomly initialised ``transformers.AlbertModel`` is run 10 times on a
    fixed two-sequence batch, converted with
    ``turbo_transformers.AlbertModel.from_torch`` and run 10 times again.
    Wall-clock time for both loops is printed, and the first output of each
    is asserted to differ by less than 0.1 in maximum absolute value.

    Args:
        loadtype: How to obtain the turbo model; only ``LoadType.PYTORCH``
            is supported.
        use_cuda: Run on ``cuda:0`` when true, ``cpu:0`` otherwise.

    Raises:
        ValueError: If ``loadtype`` is not ``LoadType.PYTORCH``.
    """
    # Reject unsupported load types before doing any work. Raising a plain
    # string is a TypeError in Python 3, so a real exception type is used.
    if loadtype is not LoadType.PYTORCH:
        raise ValueError("LoadType is not supported")

    cfg = transformers.AlbertConfig(hidden_size=768,
                                    num_attention_heads=12,
                                    intermediate_size=3072)
    model = transformers.AlbertModel(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    # Create the input on the same device as the model; a CPU tensor fed to
    # a CUDA model fails with a device mismatch.
    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)
    model.to(test_device)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))

    # Build the turbo model from the PyTorch model instance created above.
    tt_model = turbo_transformers.AlbertModel.from_torch(model)

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()
    print("\nturbo time consum: {}".format(end_time - start_time))

    assert (numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1)