def create_longformer_test_data(
    model,
    output_dir,
    batch_size,
    sequence_length,
    test_cases,
    seed,
    verbose,
    input_ids_name,
    input_mask_name,
    global_mask_name,
    num_global_tokens,
):
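    """Generate random Longformer test inputs and save them as test data files under output_dir."""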

    input_ids, input_mask, global_mask = get_longformer_inputs(
        model, input_ids_name, input_mask_name, global_mask_name)
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        input_mask,
        global_mask,
        num_global_tokens,
    )

    for i, inputs in enumerate(all_inputs):
        output_test_data(output_dir, i, inputs)

# Example 2

def run_test(baseline_model, optimized_model, output_dir, batch_size,
             sequence_length, use_gpu, test_cases, seed, use_openmp, verbose,
             rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
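    """Run the baseline and optimized BERT models on random inputs and verify their outputs match within tolerance."""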

    # Try to deduce input names from the optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use a random mask length for the accuracy test. This may slightly inflate the latency reported by this script.
    all_inputs = generate_test_data(batch_size,
                                    sequence_length,
                                    test_cases,
                                    seed,
                                    verbose,
                                    input_ids,
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=True)

    # OpenMP environment variables must be set before the very first "import onnxruntime"
    if use_openmp:
        setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False),
                             omp_wait_policy='ACTIVE')
    else:
        setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE')

    baseline_results, baseline_latency, output_names = run_model(
        baseline_model,
        all_inputs,
        use_gpu,
        use_openmp,
        disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for i, inputs in enumerate(all_inputs):
            output_test_data(output_dir, i, inputs)

    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model,
        all_inputs,
        use_gpu,
        use_openmp,
        disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the outputs of baseline and treatment to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)

# Example 3

def run_test(
    baseline_model,
    optimized_model,
    output_dir,
    batch_size,
    sequence_length,
    use_gpu,
    test_cases,
    seed,
    verbose,
    rtol,
    atol,
    input_ids_name,
    segment_ids_name,
    input_mask_name,
):
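    """Run the baseline and optimized BERT models on random inputs and verify their outputs match within tolerance."""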

    # Try to deduce input names from the optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name)

    # Use a random mask length for the accuracy test. This may slightly inflate the latency reported by this script.
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        segment_ids,
        input_mask,
        random_mask_length=True,
    )

    baseline_results, baseline_latency, output_names = run_model(
        baseline_model, all_inputs, use_gpu, disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".
              format(statistics.mean(baseline_latency) * 1000))

    if output_dir is not None:
        for i, inputs in enumerate(all_inputs):
            output_test_data(output_dir, i, inputs)

    treatment_results, treatment_latency, treatment_output_names = run_model(
        optimized_model, all_inputs, use_gpu, disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(
            statistics.mean(treatment_latency) * 1000))

    # Validate the outputs of baseline and treatment to make sure the results are similar.
    compare(baseline_results, treatment_results, verbose, rtol, atol)


def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
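    """Compare ONNX Runtime beam search output with Hugging Face GPT-2 generation and measure latency."""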
    if args.model_type != "gpt2":
        print(
            f"Skipping parity test since the support for model type {args.model_type} is not implemented in OnnxRuntime"
        )
        return True

    if args.temperature != 1.0:
        # TODO: implement temperature in BeamSearch operator.
        print("Skipping parity test as temperature is not implemented in BeamSearch operator")
        return True

    if args.prefix_vocab_mask:
        print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
        return True

    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Use sentences of different lengths to test batching
    if sentences is None:
        sentences = [
            "The product is released",
            "I enjoy walking in the park",
            "Test best way to invest",
        ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    bad_words = "walk in park"
    bad_words_ids = tokenizer.encode(bad_words, add_prefix_space=True)
    bad_words_ids = [[word_id] for word_id in bad_words_ids]  # Convert to a list of lists
    if use_vocab_mask:
        print("bad_words_ids", bad_words_ids)
    else:
        bad_words_ids = None

    global config
    config = model.config
    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    torch_decoded_sequences = []
    if not args.disable_parity:
        print("-" * 50)
        print("Test PyTorch model and beam search with huggingface transformers...")
        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )
        print("input_ids", input_ids)
        print("huggingface transformers outputs:")
        print("sequences", beam_outputs.sequences)
        if args.output_sequences_scores:
            print("sequences_scores", beam_outputs.sequences_scores)
        if args.output_token_scores:
            print("scores", beam_outputs.scores)
        for i, sequence in enumerate(beam_outputs.sequences):
            decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
            torch_decoded_sequences.append(decoded_sequence)
            print("{}: {}".format(i, decoded_sequence))

    print("-" * 50)
    print("Test ONNX model and bream search with onnxruntime...")

    ort_session = create_ort_session(args.output, args.use_gpu)

    # vocab_mask marks every token as allowed (1); bad-word token ids are blocked (0) below.
    vocab_mask = np.ones(vocab_size, dtype=np.int32)
    if use_vocab_mask:
        for bad_word_id in bad_words_ids:
            vocab_mask[bad_word_id] = 0

    inputs = {
        "input_ids": input_ids.cpu().numpy().astype(np.int32),
        "max_length": np.array([args.max_length], dtype=np.int32),
        "min_length": np.array([args.min_length], dtype=np.int32),
        "num_beams": np.array([args.num_beams], dtype=np.int32),
        "num_return_sequences": np.array([args.num_return_sequences], dtype=np.int32),
        "temperature": np.array([args.temperature], dtype=np.float32),
        "length_penalty": np.array([args.length_penalty], dtype=np.float32),
        "repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
        "vocab_mask": vocab_mask,
    }

    test_data_dir = Path(args.output).parent.as_posix()
    print("test_data_dir", test_data_dir)
    from bert_test_data import output_test_data

    all_inputs = [inputs]
    for i, inputs in enumerate(all_inputs):
        data_dir = os.path.join(test_data_dir, "test_data_set_" + str(i))
        output_test_data(data_dir, inputs)

    print("inputs", inputs)

    # Test performance
    latency = []
    for _ in range(args.total_runs):
        start = time.time()
        result = ort_session.run(None, inputs)
        latency.append(time.time() - start)
    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    output = get_latency_result(latency, batch_size)

    print("ORT outputs:")
    sequences = result[0]
    print("sequences", sequences)
    if args.output_sequences_scores:
        print("sequences_scores", result[1])
    if args.output_token_scores:
        print("scores", result[2])

    (batch_size, num_sequences, max_length) = sequences.shape
    ort_decoded_sequences = []
    for i in range(batch_size):
        for j in range(num_sequences):
            decoded_sequence = tokenizer.decode(sequences[i][j], skip_special_tokens=True)
            ort_decoded_sequences.append(decoded_sequence)
            print(f"batch {i} sequence {j}: {decoded_sequence}")

    if not args.disable_parity:
        torch_sequences = beam_outputs.sequences.reshape(batch_size, args.num_return_sequences, -1)
        ort_sequences = torch.LongTensor(sequences)
        print("-" * 50)
        print("Torch Sequences:")
        print(torch_sequences)
        print(torch_decoded_sequences)
        print("-" * 50)
        print("ORT Sequences:")
        print(ort_sequences)
        print(ort_decoded_sequences)
        print("-" * 50)
        # Compare the generated text instead of token IDs, since ORT pads to the max sequence length but Torch does not.
        is_same = torch_decoded_sequences == ort_decoded_sequences
        print("Torch and ORT results are", "the same" if is_same else "different")
        output["parity"] = is_same

    if args.torch_performance:
        torch_latency_output = test_torch_performance(
            args,
            model,
            input_ids,
            attention_mask,
            eos_token_id,
            pad_token_id,
            bad_words_ids,
        )
        print("Torch Latency", torch_latency_output)

    print("ORT", output)
    return output