def _parse_model(url, model_name, verbose=False):
    protocol = ProtocolType.from_str('gRPC')
    ctx = ServerStatusContext(url, protocol, model_name, verbose)
    server_status = ctx.get_server_status()

    if model_name not in server_status.model_status:
        raise Exception("unable to get status for '" + model_name + "'")

    status = server_status.model_status[model_name]
    config = status.config

    return config
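# Usage sketch (hedged, not part of the example above): call _parse_model and
# inspect the returned protobuf config. The import below is the legacy
# tensorrtserver (TRTIS v1) client API that _parse_model relies on; the server
# URL and model name are placeholder assumptions.
from tensorrtserver.api import ProtocolType, ServerStatusContext

if __name__ == "__main__":
    config = _parse_model("localhost:8001", "my_model", verbose=True)
    print("max_batch_size =", config.max_batch_size)
    for inp in config.input:
        print("input", inp.name, "dims", list(inp.dims))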
Example #2
    def check_model_status(self) -> bool:
        name = self.model_info.architecture
        version = self.model_info.version.ver
        ctx = ServerStatusContext(self.SERVER_URI, ProtocolType.GRPC,
                                  self.model_info.architecture)
        try:
            server_status: ServerStatus = ctx.get_server_status()
            version_status = server_status.model_status[name].version_status[version]
            # ready_state == 1 corresponds to the MODEL_READY state
            return version_status.ready_state == 1
        except InferenceServerException as e:
            print(e, file=sys.stderr)
            return False
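# Usage sketch (hedged): poll check_model_status until the model reports ready.
# wait_until_ready, the timeout and the interval are illustrative assumptions,
# not part of the original project; `client` is an instance of the class that
# defines check_model_status above.
import time

def wait_until_ready(client, timeout_s=60.0, interval_s=1.0) -> bool:
    """Return True once client.check_model_status() succeeds, False on timeout."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if client.check_model_status():
            return True
        time.sleep(interval_s)
    return False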
Example #3
def parse_model(url, protocol, model_name, verbose=False):
    """
    """

    ctx = ServerStatusContext(url, protocol, model_name, verbose)
    server_status = ctx.get_server_status()

    if model_name not in server_status.model_status:
        raise Exception("unable to get status for '" + model_name + "'")

    status = server_status.model_status[model_name]
    config = status.config

    # Model specifying maximum batch size of 0 indicates that batching
    # is not supported and so the input tensors do not expect an "N"
    # dimension (and 'batch_size' should be 1 so that only a single
    # image instance is inferred at a time).
    print(f"max_batch_size = {config.max_batch_size}")

    input_names = []
    for idx, each_input in enumerate(config.input):
        input_names.append(each_input.name)
        print("---------")
        print(f"input {idx}, name = {each_input.name}")
        print(f"input.dims = {each_input.dims}")
        print(
            f"input type = {model_config.DataType.Name(each_input.data_type)}")
        print(
            f"input format = {model_config.ModelInput.Format.Name(each_input.format)}"
        )

    output_names = []
    for idx, each_output in enumerate(config.output):
        output_names.append(each_output.name)
        print("---------")
        print(f"output {idx}, name = {each_output.name}")
        print(f"output.dims = {each_output.dims}")
        print(
            f"output type = {model_config.DataType.Name(each_output.data_type)}"
        )

    return input_names, output_names
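# Usage sketch (hedged): the names returned by parse_model feed straight into an
# InferContext call. infer_once and the layout of np_inputs (a dict mapping
# input name to a numpy array) are illustrative assumptions; InferContext and
# ProtocolType come from the legacy tensorrtserver (TRTIS v1) client API.
from tensorrtserver.api import InferContext, ProtocolType

def infer_once(url, model_name, model_version, np_inputs):
    """Run a single batch-of-one request using the tensor names from parse_model."""
    protocol = ProtocolType.from_str("grpc")
    input_names, output_names = parse_model(url, protocol, model_name)
    ctx = InferContext(url, protocol, model_name, model_version)
    inputs = {name: [np_inputs[name]] for name in input_names}
    outputs = {name: InferContext.ResultFormat.RAW for name in output_names}
    return ctx.run(inputs, outputs, batch_size=1)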
Example #4
def run_client():
    """
    Run SQuAD question answering against a TRTIS server: preprocess the
    evaluation examples, send asynchronous inference requests, collect the
    raw results and write the predictions to disk.
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    eval_examples = read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=FLAGS.version_2_with_negative)

    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir,
                                     "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                             "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)
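# Hedged reconstruction of helpers referenced above but not shown in this
# listing (label_id_key, RawResult, PendingResult, batch). The exact definitions
# in the original project may differ; the tensor names below are the usual BERT
# inputs and are an assumption, chosen only to mirror how the helpers are used.
import collections
import numpy as np

label_id_key = "unique_ids"  # assumed name of the feature field carrying unique ids

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])
PendingResult = collections.namedtuple("PendingResult",
                                       ["async_id", "start_time", "inputs"])

def batch(features, batch_size):
    """Yield input dicts of at most batch_size features, keyed by tensor name."""
    for start in range(0, len(features), batch_size):
        chunk = features[start:start + batch_size]
        yield {
            "input_ids": [np.array(f.input_ids, dtype=np.int32) for f in chunk],
            "input_mask": [np.array(f.input_mask, dtype=np.int32) for f in chunk],
            "segment_ids": [np.array(f.segment_ids, dtype=np.int32) for f in chunk],
            label_id_key: [np.array([f.unique_id], dtype=np.int32) for f in chunk],
        }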
Example #5
    def parse_model(url: str,
                    protocol: ProtocolType,
                    model_name: str,
                    batch_size: int,
                    verbose=False):
        """
        Determines a model's configuration by interpreting the status reported by NVIDIA's TensorRT Inference Server

        # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
        #
        # Redistribution and use in source and binary forms, with or without
        # modification, are permitted provided that the following conditions
        # are met:
        #  * Redistributions of source code must retain the above copyright
        #    notice, this list of conditions and the following disclaimer.
        #  * Redistributions in binary form must reproduce the above copyright
        #    notice, this list of conditions and the following disclaimer in the
        #    documentation and/or other materials provided with the distribution.
        #  * Neither the name of NVIDIA CORPORATION nor the names of its
        #    contributors may be used to endorse or promote products derived
        #    from this software without specific prior written permission.
        #
        # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
        # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
        # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
        # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
        # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
        # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
        # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
        # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
        # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
        # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
        # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

        @param url: The server's url
        @param protocol: The protocol used to access the server (i.e. gRPC or REST)
        @param model_name: The name of the model
        @param batch_size: The desired model batch size
        @param verbose: If true, prints out the server's response
        @return: (input name, output name, channels, height, width, input format, numpy dtype of the input)
        """
        ctx = ServerStatusContext(url, protocol, model_name, verbose)
        server_status = ctx.get_server_status()

        if model_name not in server_status.model_status:
            raise Exception("unable to get status for '" + model_name + "'")

        status = server_status.model_status[model_name]
        config = status.config

        if len(config.input) != 1:
            raise Exception("expecting 1 input, got {}".format(
                len(config.input)))
        if len(config.output) != 1:
            raise Exception("expecting 1 output, got {}".format(
                len(config.output)))

        input = config.input[0]
        output = config.output[0]

        if output.data_type != model_config.TYPE_FP32:
            raise Exception(
                "expecting output datatype to be TYPE_FP32, model '" +
                model_name + "' output type is " +
                model_config.DataType.Name(output.data_type))

        # Output is expected to be a vector. But allow any number of dimensions as long as all but 1 is size 1
        # (e.g. { 10 }, { 1, 10}, { 10, 1, 1 } are all ok). Variable-size dimensions are not currently supported.
        non_one_cnt = 0
        for dim in output.dims:
            if dim == -1:
                raise Exception(
                    "variable-size dimension in model output not supported")
            if dim > 1:
                non_one_cnt += 1
                if non_one_cnt > 1:
                    raise Exception("expecting model output to be a vector")

        # Model specifying maximum batch size of 0 indicates that batching is not supported and so the input tensors do
        # not expect an "N" dimension (and 'batch_size' should be 1 so that only a single image instance is inferred at
        # a time).
        max_batch_size = config.max_batch_size
        if max_batch_size == 0:
            if batch_size != 1:
                raise Exception("batching not supported for model '" +
                                model_name + "'")
        else:  # max_batch_size > 0
            if batch_size > max_batch_size:
                raise Exception(
                    "expecting batch size <= {} for model {}".format(
                        max_batch_size, model_name))

        # Model input must have 3 dims, either CHW or HWC
        if len(input.dims) != 3:
            raise Exception(
                "expecting input to have 3 dimensions, model '{}' input has {}"
                .format(model_name, len(input.dims)))

        # Variable-size dimensions are not currently supported.
        for dim in input.dims:
            if dim == -1:
                raise Exception(
                    "variable-size dimension in model input not supported")

        if ((input.format != model_config.ModelInput.FORMAT_NCHW)
                and (input.format != model_config.ModelInput.FORMAT_NHWC)):
            raise Exception("unexpected input format " +
                            model_config.ModelInput.Format.Name(input.format) +
                            ", expecting " +
                            model_config.ModelInput.Format.Name(
                                model_config.ModelInput.FORMAT_NCHW) + " or " +
                            model_config.ModelInput.Format.Name(
                                model_config.ModelInput.FORMAT_NHWC))

        if input.format == model_config.ModelInput.FORMAT_NHWC:
            h = input.dims[0]
            w = input.dims[1]
            c = input.dims[2]
        else:
            c = input.dims[0]
            h = input.dims[1]
            w = input.dims[2]

        return input.name, output.name, c, h, w, input.format, ImageBasedModel.model_dtype_to_np(
            input.data_type)
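# Hedged sketch of the dtype translation used in the return statement above.
# ImageBasedModel.model_dtype_to_np is not shown in this listing; the mapping
# below follows the TRTIS model_config DataType enum and may not match the
# project's exact implementation. The import path mirrors the legacy
# tensorrtserver client packaging and is an assumption.
import numpy as np
from tensorrtserver.api import model_config_pb2 as model_config

_DTYPE_MAP = {
    model_config.TYPE_BOOL: np.bool_,
    model_config.TYPE_UINT8: np.uint8,
    model_config.TYPE_INT8: np.int8,
    model_config.TYPE_INT16: np.int16,
    model_config.TYPE_INT32: np.int32,
    model_config.TYPE_INT64: np.int64,
    model_config.TYPE_FP16: np.float16,
    model_config.TYPE_FP32: np.float32,
    model_config.TYPE_FP64: np.float64,
}

def model_dtype_to_np(model_dtype):
    """Translate a model_config DataType value into the matching numpy dtype."""
    try:
        return _DTYPE_MAP[model_dtype]
    except KeyError:
        raise Exception("unsupported model data type " +
                        model_config.DataType.Name(model_dtype))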
Example #6
def run_client():
    """
    Ask a question about a hard-coded context paragraph on TRTIS: build the
    evaluation features, send asynchronous inference requests and report
    latency/throughput statistics.
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # -------------------------------------------------------------
    # Creation of examples here
    # -------------------------------------------------------------
    paragraph = """The koala (Phascolarctos cinereus, or, inaccurately, koala bear[a]) is an arboreal herbivorous marsupial native to Australia. It is the only extant representative of the family Phascolarctidae and its closest living relatives are the wombats, which comprise the family Vombatidae. The koala is found in coastal areas of the mainland's eastern and southern regions, inhabiting Queensland, New South Wales, Victoria, and South Australia. It is easily recognisable by its stout, tailless body and large head with round, fluffy ears and large, spoon-shaped nose. The koala has a body length of 60–85 cm (24–33 in) and weighs 4–15 kg (9–33 lb). Fur colour ranges from silver grey to chocolate brown. Koalas from the northern populations are typically smaller and lighter in colour than their counterparts further south. These populations possibly are separate subspecies, but this is disputed.
    """
    question_text = "Who is Koala?"
    examples = []
    example = SquadExample(
        qas_id=1,
        question_text=question_text,
        doc_tokens=convert_doc_tokens(paragraph_text=paragraph))
    for _ in range(30):
        examples.append(example)

    # Switching from predict_file read to api-read
    # eval_examples = read_squad_examples(
    #     input_file=FLAGS.predict_file, is_training=False,
    #     version_2_with_negative=FLAGS.version_2_with_negative)
    eval_examples = examples
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")
Example #7
def parse_model(url, protocol, model_name, batch_size, verbose=False):
    """
    Check the configuration of a model to make sure it meets the
    requirements for an image classification network (as expected by
    this client)
    """
    ctx = ServerStatusContext(url, protocol, model_name, verbose)
    server_status: ServerStatus = ctx.get_server_status()

    if model_name not in server_status.model_status:
        raise Exception("unable to get status for '" + model_name + "'")

    status: ModelStatus = server_status.model_status[model_name]
    config = status.config

    if len(config.input) != 1:
        raise Exception("expecting 1 input, got {}".format(len(config.input)))
    if len(config.output) != 1:
        raise Exception("expecting 1 output, got {}".format(len(
            config.output)))

    input = config.input[0]
    output = config.output[0]

    if output.data_type != model_config.TYPE_FP32:
        raise Exception("expecting output datatype to be TYPE_FP32, model '" +
                        model_name + "' output type is " +
                        model_config.DataType.Name(output.data_type))

    # Output is expected to be a vector. But allow any number of
    # dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10
    # }, { 10, 1, 1 } are all ok). Variable-size dimensions are not
    # currently supported.
    non_one_cnt = 0
    for dim in output.dims:
        if dim == -1:
            raise Exception(
                "variable-size dimension in model output not supported")
        if dim > 1:
            non_one_cnt += 1
            if non_one_cnt > 1:
                raise Exception("expecting model output to be a vector")

    # Model specifying maximum batch size of 0 indicates that batching
    # is not supported and so the input tensors do not expect an "N"
    # dimension (and 'batch_size' should be 1 so that only a single
    # image instance is inferred at a time).
    max_batch_size = config.max_batch_size
    if max_batch_size == 0:
        if batch_size != 1:
            raise Exception("batching not supported for model '" + model_name +
                            "'")
    else:  # max_batch_size > 0
        if batch_size > max_batch_size:
            raise Exception("expecting batch size <= {} for model {}".format(
                max_batch_size, model_name))

    # Model input must have 3 dims, either CHW or HWC
    if len(input.dims) != 3:
        raise Exception(
            "expecting input to have 3 dimensions, model '{}' input has {}".
            format(model_name, len(input.dims)))

    # Variable-size dimensions are not currently supported.
    for dim in input.dims:
        if dim == -1:
            raise Exception(
                "variable-size dimension in model input not supported")

    if ((input.format != model_config.ModelInput.FORMAT_NCHW)
            and (input.format != model_config.ModelInput.FORMAT_NHWC)):
        raise Exception("unexpected input format " +
                        model_config.ModelInput.Format.Name(input.format) +
                        ", expecting " + model_config.ModelInput.Format.Name(
                            model_config.ModelInput.FORMAT_NCHW) + " or " +
                        model_config.ModelInput.Format.Name(
                            model_config.ModelInput.FORMAT_NHWC))

    if input.format == model_config.ModelInput.FORMAT_NHWC:
        h = input.dims[0]
        w = input.dims[1]
        c = input.dims[2]
    else:
        c = input.dims[0]
        h = input.dims[1]
        w = input.dims[2]

    return input.name, output.name, c, h, w, input.format, model_data_type_to_np(
        input.data_type)
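# Usage sketch (hedged): lay out an image tensor according to the format
# reported by parse_model. make_input, the URL and the model name are
# illustrative assumptions; image decoding/resizing is out of scope here, and
# model_config is the model_config_pb2 module already used by parse_model above.
import numpy as np
from tensorrtserver.api import ProtocolType

def make_input(image_hwc, url="localhost:8001", model_name="my_image_model"):
    """Return (input_name, tensor) with the layout the model expects."""
    protocol = ProtocolType.from_str("grpc")
    input_name, output_name, c, h, w, fmt, dtype = parse_model(
        url, protocol, model_name, batch_size=1)
    tensor = image_hwc.astype(dtype)
    if fmt == model_config.ModelInput.FORMAT_NCHW:
        tensor = np.transpose(tensor, (2, 0, 1))  # HWC -> CHW
    return input_name, tensor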
Example #8
def main(_):
    """
    Ask a question of context on Triton.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """
    # Disable XLA lazy compilation: it causes memory fragmentation for BERT,
    # leading to OOM.
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the Data
    if FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    elif FLAGS.question and FLAGS.answer:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]

        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    else:
        raise ValueError(
            "Either predict_file or question+answer need to be defined")

    # Get Eval Features = Preprocessing
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = True
    model_name = FLAGS.triton_model_name
    model_version = FLAGS.triton_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()
    user_data = UserData()

    max_outstanding = 20
    # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):

        if (outstanding == 0 or do_wait is False):
            return outstanding

        # Wait for deferred items from callback functions
        (infer_ctx, ready_id, idx, start_time,
         inputs) = user_data._completed_requests.get()

        if (ready_id is None):
            return outstanding

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outstanding -= 1

        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)
        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()
    idx = 0
    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start_time = time.time()
        ctx.async_run(partial(completion_callback, user_data, idx, start_time,
                              inputs_dict),
                      inputs_dict,
                      outputs_dict,
                      batch_size=present_batch_size)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding,
                                          outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while (outstanding > 0):
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write results to json file
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
        print("Latency Average (ms)  =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print(
            "Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
            (FLAGS.context, FLAGS.question, all_predictions[0]))
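# Hedged reconstruction of the UserData/completion_callback pair used above but
# not shown in this listing. It only mirrors how they are consumed: async_run
# invokes the callback with (infer_ctx, request_id), the remaining arguments
# were bound via functools.partial at submission time, and process_outstanding
# later pops the tuple from user_data._completed_requests.
import queue

class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()

def completion_callback(user_data, idx, start_time, inputs, infer_ctx, request_id):
    """Queue a finished request together with its bookkeeping data."""
    user_data._completed_requests.put(
        (infer_ctx, request_id, idx, start_time, inputs))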