def create_longformer_test_data(
    model,
    output_dir,
    batch_size,
    sequence_length,
    test_cases,
    seed,
    verbose,
    input_ids_name,
    input_mask_name,
    global_mask_name,
    num_global_tokens,
):
    input_ids, input_mask, global_mask = get_longformer_inputs(model, input_ids_name, input_mask_name, global_mask_name)
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        input_mask,
        global_mask,
        num_global_tokens,
    )

    for i, inputs in enumerate(all_inputs):
        output_test_data(output_dir, i, inputs)
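def _example_create_longformer_test_data():
    # Usage sketch (not part of the original script): generate three random test
    # cases for a Longformer ONNX model. The model path and output directory are
    # hypothetical placeholders, and this assumes get_longformer_inputs can deduce
    # the input names when None is passed, as get_bert_inputs does in run_test below.
    create_longformer_test_data(
        model="longformer-base-4096.onnx",
        output_dir="./longformer_test_data",
        batch_size=1,
        sequence_length=512,
        test_cases=3,
        seed=3,
        verbose=False,
        input_ids_name=None,
        input_mask_name=None,
        global_mask_name=None,
        num_global_tokens=16,
    )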
def run_test(
    baseline_model,
    optimized_model,
    output_dir,
    batch_size,
    sequence_length,
    use_gpu,
    test_cases,
    seed,
    verbose,
    rtol,
    atol,
    input_ids_name,
    segment_ids_name,
    input_mask_name,
):
    # Try to deduce input names from the optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use a random mask length for the accuracy test. This might slightly inflate the latency reported by this script.
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=True,
    )

    baseline_results, baseline_latency, output_names = run_model(baseline_model, all_inputs, use_gpu, disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for i, inputs in enumerate(all_inputs):
            output_test_data(output_dir, i, inputs)

    treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model, all_inputs, use_gpu, disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))

    # Validate that the baseline and treatment outputs are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)
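def _example_run_test():
    # Usage sketch (not part of the original script): compare a baseline BERT
    # model against its optimized counterpart on 10 random test cases. The
    # .onnx paths are hypothetical placeholders; passing None for the input
    # names lets get_bert_inputs deduce them from the optimized model.
    run_test(
        baseline_model="bert_base.onnx",
        optimized_model="bert_base_opt.onnx",
        output_dir=None,
        batch_size=1,
        sequence_length=128,
        use_gpu=False,
        test_cases=10,
        seed=3,
        verbose=True,
        rtol=1e-3,
        atol=1e-4,
        input_ids_name=None,
        segment_ids_name=None,
        input_mask_name=None,
    )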
def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
    if args.model_type != "gpt2":
        print(f"Skipping parity test since support for model type {args.model_type} is not implemented in OnnxRuntime")
        return True

    if args.temperature != 1.0:
        # TODO: implement temperature in BeamSearch operator.
        print("Skipping parity test as temperature is not implemented in BeamSearch operator")
        return True

    if args.prefix_vocab_mask:
        print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
        return True

    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Use sentences of different lengths to test batching.
    if sentences is None:
        sentences = [
            "The product is released",
            "I enjoy walking in the park",
            "Test best way to invest",
        ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    bad_words = "walk in park"
    bad_words_ids = tokenizer.encode(bad_words, add_prefix_space=True)
    bad_words_ids = [[word_id] for word_id in bad_words_ids]  # Convert to list of lists.
    if use_vocab_mask:
        print("bad_words_ids", bad_words_ids)
    else:
        bad_words_ids = None

    global config
    config = model.config
    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    torch_decoded_sequences = []
    if not args.disable_parity:
        print("-" * 50)
        print("Test PyTorch model and beam search with huggingface transformers...")
        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )

        print("input_ids", input_ids)
        print("huggingface transformers outputs:")
        print("sequences", beam_outputs.sequences)
        if args.output_sequences_scores:
            print("sequences_scores", beam_outputs.sequences_scores)
        if args.output_token_scores:
            print("scores", beam_outputs.scores)

        for i, sequence in enumerate(beam_outputs.sequences):
            decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
            torch_decoded_sequences.append(decoded_sequence)
            print("{}: {}".format(i, decoded_sequence))

    print("-" * 50)
    print("Test ONNX model and beam search with onnxruntime...")

    ort_session = create_ort_session(args.output, args.use_gpu)

    vocab_mask = np.ones((vocab_size,), dtype=np.int32)
    if use_vocab_mask:
        for bad_word_id in bad_words_ids:
            vocab_mask[bad_word_id] = 0

    inputs = {
        "input_ids": input_ids.cpu().numpy().astype(np.int32),
        "max_length": np.array([args.max_length], dtype=np.int32),
        "min_length": np.array([args.min_length], dtype=np.int32),
        "num_beams": np.array([args.num_beams], dtype=np.int32),
        "num_return_sequences": np.array([args.num_return_sequences], dtype=np.int32),
        "temperature": np.array([args.temperature], dtype=np.float32),
        "length_penalty": np.array([args.length_penalty], dtype=np.float32),
        "repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
        "vocab_mask": vocab_mask,
    }

    test_data_dir = Path(args.output).parent.as_posix()
    print("test_data_dir", test_data_dir)
    from bert_test_data import output_test_data

    all_inputs = [inputs]
    for i, inputs in enumerate(all_inputs):
        # Avoid shadowing the builtin "dir".
        test_data_set_dir = os.path.join(test_data_dir, "test_data_set_" + str(i))
        output_test_data(test_data_set_dir, inputs)

    print("inputs", inputs)

    # Test performance.
    latency = []
    for _ in range(args.total_runs):
        start = time.time()
        result = ort_session.run(None, inputs)
        latency.append(time.time() - start)

    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    output = get_latency_result(latency, batch_size)

    print("ORT outputs:")
    sequences = result[0]
    print("sequences", sequences)
    if args.output_sequences_scores:
        print("sequences_scores", result[1])
    if args.output_token_scores:
        print("scores", result[2])

    (batch_size, num_sequences, max_length) = sequences.shape
    ort_decoded_sequences = []
    for i in range(batch_size):
        for j in range(num_sequences):
            decoded_sequence = tokenizer.decode(sequences[i][j], skip_special_tokens=True)
            ort_decoded_sequences.append(decoded_sequence)
            print(f"batch {i} sequence {j}: {decoded_sequence}")

    if not args.disable_parity:
        torch_sequences = beam_outputs.sequences.reshape(batch_size, args.num_return_sequences, -1)
        ort_sequences = torch.LongTensor(sequences)
        print("-" * 50)
        print("Torch Sequences:")
        print(torch_sequences)
        print(torch_decoded_sequences)
        print("-" * 50)
        print("ORT Sequences:")
        print(ort_sequences)
        print(ort_decoded_sequences)
        print("-" * 50)
        # Compare the generated text instead of token IDs since ORT pads to max sequence length but Torch does not.
        is_same = torch_decoded_sequences == ort_decoded_sequences
        print("Torch and ORT result is ", "same" if is_same else "different")
        output["parity"] = is_same

    if args.torch_performance:
        torch_latency_output = test_torch_performance(
            args,
            model,
            input_ids,
            attention_mask,
            eos_token_id,
            pad_token_id,
            bad_words_ids,
        )
        print("Torch Latency", torch_latency_output)

    print("ORT", output)
    return output
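def _example_test_model():
    # Usage sketch (not part of the original script): test_model expects the
    # argparse namespace produced by this script's own argument parser. The
    # field names below mirror the attributes test_model reads; their values
    # and the "model.onnx" output path are hypothetical placeholders.
    from argparse import Namespace

    args = Namespace(
        model_type="gpt2",
        model_name_or_path="gpt2",
        cache_dir="./cache_models",
        output="model.onnx",
        use_gpu=False,
        total_runs=1,
        max_length=50,
        min_length=1,
        num_beams=4,
        num_return_sequences=1,
        temperature=1.0,
        length_penalty=1.0,
        repetition_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=0,
        prefix_vocab_mask=False,
        disable_parity=False,
        torch_performance=False,
        output_sequences_scores=False,
        output_token_scores=False,
    )
    return test_model(args)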