def test_torch_latency(
    device,
    model,
    model_name,
    batch_sizes,
    sequence_lengths,
    global_lengths,
    test_times,
    num_threads,
    verbose,
):
    if num_threads > 0:
        torch.set_num_threads(num_threads)

    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:
            for global_length in global_lengths:
                print(f"batch_size={batch_size} sequence_length={sequence_length} global_length={global_length}...")
                inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length, global_length, device)
                input_list = inputs.to_list()

                _ = model(*input_list)
                runtimes = timeit.repeat(lambda: model(*input_list), repeat=test_times, number=1)
                result = {
                    "engine": "torch",  # TODO: test torchscript
                    "version": torch.__version__,
                    "device": "cuda",
                    "optimizer": "",
                    "precision": "fp32",
                    "io_binding": "",
                    "model_name": model_name,
                    "description": model_name + " [torch]",
                    "inputs": 3,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "datetime": str(datetime.now()),
                    "memory": "NA",
                    "diff_max": 0,
                    "diff_90_percentile": 0,
                    "diff_95_percentile": 0,
                    "diff_99_percentile": 0,
                    "use_compact_memory": "NA",
                }
                result.update(benchmark_helper.get_latency_result(runtimes, batch_size))
                print(result)
                results.append(result)

    return results
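# For reference, a minimal sketch of the kind of statistics a helper like
# benchmark_helper.get_latency_result aggregates from the timeit runs above.
# The key names and fields below are illustrative assumptions, not the actual
# benchmark_helper API.
def _latency_stats_sketch(runtimes, batch_size):
    import statistics

    latency_ms = statistics.mean(runtimes) * 1000.0
    return {
        "test_times": len(runtimes),  # number of measured runs
        "average_latency_ms": f"{latency_ms:.2f}",
        "QPS": f"{batch_size * 1000.0 / latency_ms:.2f}",  # throughput in queries per second
    }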
def test_torch(device, model, model_name, batch_sizes, sequence_lengths, global_lengths, test_times, num_threads):
    # The following is commented out so that PyTorch uses its default thread settings as well.
    # if num_threads <= 0:
    #     import psutil
    #     num_threads = psutil.cpu_count(logical=False)
    if num_threads > 0:
        torch.set_num_threads(num_threads)

    results = []
    for batch_size in batch_sizes:
        # sequence_length is the total length of <query, document>.
        for sequence_length in sequence_lengths:
            # global_length is the length of <query>: a short query (8) for search keywords,
            # and a longer query (16) for question-like input.
            for global_length in global_lengths:
                print(f"batch_size={batch_size} sequence_length={sequence_length} global_length={global_length}...")
                input_ids, attention_mask, global_attention_mask = get_dummy_inputs(sequence_length, global_length, device)

                # Run PyTorch once to warm up, then measure latency.
                _ = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
                runtimes = timeit.repeat(
                    lambda: model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask),
                    repeat=test_times,
                    number=1)
                result = {
                    "engine": "torch",  # TODO: test torchscript
                    "version": torch.__version__,
                    "device": "cuda",
                    "optimizer": "",
                    "precision": "fp32",
                    "io_binding": "",
                    "model_name": model_name,
                    "inputs": 3,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "datetime": str(datetime.now()),
                }
                result.update(benchmark_helper.get_latency_result(runtimes, batch_size))
                print(result)
                results.append(result)

    return results
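# get_dummy_inputs is defined elsewhere in this script. A minimal sketch of
# what such a helper could look like, assuming batch size 1 and that the first
# global_length tokens (the query) carry global attention; the vocab_size
# default and the function itself are illustrative assumptions, not the
# script's actual implementation.
def _dummy_longformer_inputs_sketch(sequence_length, global_length, device, vocab_size=50265):
    input_ids = torch.randint(low=0, high=vocab_size - 1, size=(1, sequence_length), dtype=torch.long, device=device)
    attention_mask = torch.ones(1, sequence_length, dtype=torch.long, device=device)
    global_attention_mask = torch.zeros(1, sequence_length, dtype=torch.long, device=device)
    global_attention_mask[:, :global_length] = 1  # Mark the query tokens for global attention.
    return input_ids, attention_mask, global_attention_mask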
def test_torch_performance(args, model, input_ids, attention_mask, eos_token_id, pad_token_id, bad_words_ids):
    if args.use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with CUDA, and use a machine with GPU for testing GPU performance.")
        return None

    if args.precision == Precision.FLOAT16:
        model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    torch.set_grad_enabled(False)

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    torch_latency = []
    for _ in range(args.total_runs):
        start = time.time()
        _ = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )
        torch_latency.append(time.time() - start)

    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    return get_latency_result(torch_latency, batch_size)
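# A hypothetical invocation of test_torch_performance, assuming a GPT-2 model
# and tokenized inputs are already in scope; every attribute value below is a
# placeholder chosen for illustration, not a recommended configuration.
from types import SimpleNamespace

example_perf_args = SimpleNamespace(
    use_gpu=False, precision=Precision.FLOAT32, total_runs=10,
    max_length=50, min_length=1, num_beams=4, early_stopping=True,
    no_repeat_ngram_size=3, num_return_sequences=1, temperature=1.0,
    length_penalty=1.0, repetition_penalty=1.0,
    output_sequences_scores=False, output_token_scores=False)
# stats = test_torch_performance(example_perf_args, model, input_ids, attention_mask,
#                                eos_token_id=50256, pad_token_id=50256, bad_words_ids=None)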
def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                   repeat_times, cache_dir, verbose):
    results = []

    import tensorflow as tf
    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], 'GPU')

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error("Please install tensorflow-gpu, and use a machine with GPU for testing GPU performance.")
        return results

    if use_gpu:
        # Restrict TensorFlow to only use the first GPU.
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            tf.config.set_visible_devices(physical_devices[0], 'GPU')
        except RuntimeError as e:
            logger.exception(e)

    if precision == Precision.FLOAT16 or precision == Precision.INT8:
        raise NotImplementedError("Mixed precision is currently not supported.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)

        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class,
                                      is_tf_model=True)

        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes[
            model_name] if model_name in tokenizer.max_model_input_sizes else 1024

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run Tensorflow on {} with input shape {}".format(model_name,
                                                                              [batch_size, sequence_length]))

                import random
                rng = random.Random()
                values = [rng.randint(0, config.vocab_size - 1) for _ in range(batch_size * sequence_length)]
                input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:

                    def encoder_forward():
                        return model(input_ids, training=False)

                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)

                    inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

                    inference()

                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    from numba import cuda
                    device = cuda.get_current_device()
                    device.reset()

    return results
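# A small standalone sketch of the GPU-restriction pattern used above: hide
# all but the first GPU, then verify which devices TensorFlow will actually
# use. The tf.config calls are the public TensorFlow 2.x API; the sketch
# assumes it runs before any tensors or models are created, since device
# visibility cannot change after TensorFlow initializes its devices.
def _restrict_to_first_gpu_sketch():
    import tensorflow as tf

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            tf.config.set_visible_devices(gpus[0], 'GPU')
        except RuntimeError as e:
            # Raised when devices were already initialized by earlier TF calls.
            print(e)
    print("visible devices:", tf.config.get_visible_devices())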
def run_pytorch(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                repeat_times, torchscript, cache_dir, verbose):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with CUDA, and use a machine with GPU for testing GPU performance.")
        return results

    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
        model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class)
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes[
            model_name] if model_name in tokenizer.max_model_input_sizes else 1024

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length]))
                input_ids = torch.randint(low=0,
                                          high=config.vocab_size - 1,
                                          size=(batch_size, sequence_length),
                                          dtype=torch.long,
                                          device=device)
                try:
                    inference = torch.jit.trace(model, input_ids) if torchscript else model
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch",
                        "version": torch.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    torch.cuda.empty_cache()

    return results
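# A hypothetical call to run_pytorch that benchmarks eager PyTorch on CPU for
# one model. model_class=None assumes load_pretrained_model falls back to a
# default Auto class, and all other values are placeholders for illustration.
# results = run_pytorch(use_gpu=False,
#                       model_names=["bert-base-cased"],
#                       model_class=None,
#                       precision=Precision.FLOAT32,
#                       num_threads=-1,
#                       batch_sizes=[1, 4],
#                       sequence_lengths=[32, 128],
#                       repeat_times=100,
#                       torchscript=False,
#                       cache_dir="./cache_models",
#                       verbose=False)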
def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
    if args.model_type != "gpt2":
        print(f"Skipping parity test since the support for model type {args.model_type} is not implemented in OnnxRuntime")
        return True

    if args.temperature != 1.0:  # TODO: implement temperature in BeamSearch operator.
        print("Skipping parity test as temperature is not implemented in BeamSearch operator")
        return True

    if args.prefix_vocab_mask:
        print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
        return True

    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Use sentences of different lengths to test batching.
    if sentences is None:
        sentences = [
            "The product is released",
            "I enjoy walking in the park",
            "Test best way to invest",
        ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    bad_words = "walk in park"
    bad_words_ids = tokenizer.encode(bad_words, add_prefix_space=True)
    bad_words_ids = [[word_id] for word_id in bad_words_ids]  # Convert to list of lists.
    if use_vocab_mask:
        print("bad_words_ids", bad_words_ids)
    else:
        bad_words_ids = None

    global config
    config = model.config
    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    torch_decoded_sequences = []
    if not args.disable_parity:
        print("-" * 50)
        print("Test PyTorch model and beam search with huggingface transformers...")
        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )

        print("input_ids", input_ids)
        print("huggingface transformers outputs:")
        print("sequences", beam_outputs.sequences)
        if args.output_sequences_scores:
            print("sequences_scores", beam_outputs.sequences_scores)
        if args.output_token_scores:
            print("scores", beam_outputs.scores)
        for i, sequence in enumerate(beam_outputs.sequences):
            decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
            torch_decoded_sequences.append(decoded_sequence)
            print("{}: {}".format(i, decoded_sequence))

    print("-" * 50)
    print("Test ONNX model and beam search with onnxruntime...")

    ort_session = create_ort_session(args.output, args.use_gpu)

    vocab_mask = np.ones((vocab_size), dtype=np.int32)
    if use_vocab_mask:
        for bad_word_id in bad_words_ids:
            vocab_mask[bad_word_id] = 0

    inputs = {
        "input_ids": input_ids.cpu().numpy().astype(np.int32),
        "max_length": np.array([args.max_length], dtype=np.int32),
        "min_length": np.array([args.min_length], dtype=np.int32),
        "num_beams": np.array([args.num_beams], dtype=np.int32),
        "num_return_sequences": np.array([args.num_return_sequences], dtype=np.int32),
        "temperature": np.array([args.temperature], dtype=np.float32),
        "length_penalty": np.array([args.length_penalty], dtype=np.float32),
        "repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
        "vocab_mask": vocab_mask,
    }

    test_data_dir = Path(args.output).parent.as_posix()
    print("test_data_dir", test_data_dir)
    from bert_test_data import output_test_data

    all_inputs = [inputs]
    for i, inputs in enumerate(all_inputs):
        dir_path = os.path.join(test_data_dir, "test_data_set_" + str(i))
        output_test_data(dir_path, inputs)

    print("inputs", inputs)

    # Test performance.
    latency = []
    for _ in range(args.total_runs):
        start = time.time()
        result = ort_session.run(None, inputs)
        latency.append(time.time() - start)

    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    output = get_latency_result(latency, batch_size)

    print("ORT outputs:")
    sequences = result[0]
    print("sequences", sequences)
    if args.output_sequences_scores:
        print("sequences_scores", result[1])
    if args.output_token_scores:
        print("scores", result[2])

    (batch_size, num_sequences, max_length) = sequences.shape
    ort_decoded_sequences = []
    for i in range(batch_size):
        for j in range(num_sequences):
            decoded_sequence = tokenizer.decode(sequences[i][j], skip_special_tokens=True)
            ort_decoded_sequences.append(decoded_sequence)
            print(f"batch {i} sequence {j}: {decoded_sequence}")

    if not args.disable_parity:
        torch_sequences = beam_outputs.sequences.reshape(batch_size, args.num_return_sequences, -1)
        ort_sequences = torch.LongTensor(sequences)
        print("-" * 50)
        print("Torch Sequences:")
        print(torch_sequences)
        print(torch_decoded_sequences)
        print("-" * 50)
        print("ORT Sequences:")
        print(ort_sequences)
        print(ort_decoded_sequences)
        print("-" * 50)
        # Compare the generated text instead of token IDs since ORT pads to the
        # max sequence length but Torch does not.
        is_same = torch_decoded_sequences == ort_decoded_sequences
        print("Torch and ORT result is", "same" if is_same else "different")
        output["parity"] = is_same

    if args.torch_performance:
        torch_latency_output = test_torch_performance(
            args,
            model,
            input_ids,
            attention_mask,
            eos_token_id,
            pad_token_id,
            bad_words_ids,
        )
        print("Torch Latency", torch_latency_output)

    print("ORT", output)

    return output
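# A hypothetical end-to-end invocation of test_model. The attribute names
# mirror the args consumed above, but every value (including the ONNX model
# path in args.output) is a placeholder for illustration only.
from types import SimpleNamespace

example_model_args = SimpleNamespace(
    model_type="gpt2", model_name_or_path="gpt2", cache_dir="./cache_models",
    output="./onnx_models/gpt2_beam_search.onnx", use_gpu=False,
    disable_parity=False, torch_performance=False, total_runs=10,
    max_length=50, min_length=1, num_beams=4, early_stopping=True,
    no_repeat_ngram_size=3, num_return_sequences=1, temperature=1.0,
    length_penalty=1.0, repetition_penalty=1.0, prefix_vocab_mask=False,
    output_sequences_scores=False, output_token_scores=False)
# output = test_model(example_model_args, use_vocab_mask=True)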