def TestGRPC(self):
        channel = grpc.insecure_channel(self.url)
        grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

        request_ = self.requestGenerator("DUMMY", FLAGS)
        # Send request
        response_ = grpc_stub.Infer(request_)
        return response_.request_status.code == 5
Example 2
def get_prediction(image_filename,
                   server_host='localhost',
                   server_port=8001,
                   model_name="bolt",
                   model_version=None):
    """
  Retrieve a prediction from a TensorFlow model server

  :param image:       a bolt image
  :param server_host: the address of the TensorRT inference server
  :param server_port: the port used by the server
  :param model_name: the name of the model
  :param timeout:     the amount of time to wait for a prediction to complete
  :return 0:          the integer predicted in the bolt image
  :return 1:          the confidence scores for all classes
  """
    channel = grpc.insecure_channel(server_host + ':' + str(server_port))
    grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    # Prepare request for Status gRPC
    request = grpc_service_pb2.StatusRequest(model_name=model_name)
    # Call and receive response from Status gRPC
    response = grpc_stub.Status(request)
    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    batch_size = 1
    verbose = False
    input_name, output_name, c, h, w, format, dtype = parse_model(
        response, model_name, batch_size, verbose)

    filledRequestGenerator = partial(requestGenerator, input_name, output_name,
                                     c, h, w, format, dtype, model_name,
                                     model_version, image_filename)

    # Send requests of batch_size images. If the number of
    # images isn't an exact multiple of batch_size then just
    # start over with the first images until the batch is filled.
    result_filenames = []
    requests = []
    responses = []

    # Send request
    for request in filledRequestGenerator(result_filenames):
        responses.append(grpc_stub.Infer(request))

    # For async, retrieve results according to the send order
    for request in requests:
        responses.append(request.result())

    idx = 0
    for response in responses:
        print("Request {}, batch size {}".format(idx, batch_size))
        label, score = postprocess(response.meta_data.output,
                                   result_filenames[idx], batch_size)
        idx += 1

    return label, score
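A hedged usage sketch of get_prediction; the image path, host, and port are placeholders and the call assumes the surrounding module's imports and a running server:

if __name__ == '__main__':
    label, score = get_prediction('bolt.jpg',
                                  server_host='localhost',
                                  server_port=8001,
                                  model_name='bolt')
    print('predicted label: {}, confidence: {}'.format(label, score))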
def main():
    hostport = "{}:8001".format(SERVER_ADDRESS)
    channel = grpc.insecure_channel(hostport)
    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
    request = init_request(MODEL_NAME, DATA_DIR, DATA_SIZE)

    while True:
        request_per_sec = int(os.environ['REQUEST_PER_SEC'])
        result = stub.Infer(request, TIMEOUT_MSEC)
        time.sleep(1. / request_per_sec)
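A hedged sketch of driving main() locally; the REQUEST_PER_SEC variable name comes from the loop above, while the value of 10 requests per second and the local invocation are assumptions:

import os

os.environ['REQUEST_PER_SEC'] = '10'  # throttle to roughly 10 inferences per second
main()                                # loops until interrupted (Ctrl+C)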
Example 4
def init_tensorrt_connection(url):
    '''
    Initialize a connection to the TensorRT Inference Server using gRPC.
    Arguments:
        url : gRPC TensorRT Server URL (host:port)
    Returns:
        grpc_stub
    '''
    channel = grpc.insecure_channel(url, options=[('grpc.max_receive_message_length', 7000000)])
    grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
    return grpc_stub
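The returned stub can then be used like the stubs in the other snippets; a minimal sketch, assuming the module's grpc_service_pb2 import, a server at localhost:8001, and a model named 'bolt' (all placeholders):

grpc_stub = init_tensorrt_connection('localhost:8001')
# Ask the server for the model's status, as the get_prediction snippets do
status_request = grpc_service_pb2.StatusRequest(model_name='bolt')
status_response = grpc_stub.Status(status_request)
print(status_response)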
    def run(self, input):
        channel = grpc.insecure_channel(self.url)
        grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

        request = self.build_request(input)
        response = grpc_stub.Infer(request)

        print(response.request_status.msg)
        print(response)
        results = {}
        for meta, raw_output in zip(self.model.output, response.raw_output):
            data_type = DTYPE[meta.data_type]
            print(raw_output)
            np_data = np.frombuffer(raw_output, dtype=data_type)
            results[meta.name] = np_data

        return results
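The decoding step above depends on looking up the right NumPy dtype for each output (the DTYPE map); a self-contained round-trip sketch of that idea with made-up FP32 values:

import numpy as np

# Raw bytes as they would arrive in response.raw_output for an FP32 tensor
original = np.array([0.1, 0.7, 0.2], dtype=np.float32)
raw_bytes = original.tobytes()

# np.frombuffer reinterprets the bytes; using the wrong dtype would silently
# produce garbage, which is why the DTYPE lookup above matters.
decoded = np.frombuffer(raw_bytes, dtype=np.float32)
assert np.array_equal(decoded, original)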
Example 6
    def fashion_matching_request(self, img):
        """
        Sends request to the running model at the tensorrt inference server and returns the response.
        
        Arguments:
        img(PIL.Image) : Input image.
        
        Returns:
        reco_res(list) : 10 nearest neighbours of the test image as python list.
        """
        with grpc.insecure_channel(self._URL) as channel:
            grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
            request = self.prepare_request(img)
            response = grpc_stub.Infer(request)
            im_emb = np.frombuffer(response.raw_output[0], np.float32)
            reco_res = self.fashion_matching_postprocess(im_emb)

        return reco_res
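fashion_matching_postprocess is not shown in this snippet; a plausible sketch of what a 10-nearest-neighbour lookup over a precomputed gallery of embeddings could look like (the gallery matrix and the Euclidean metric are assumptions):

import numpy as np

def fashion_matching_postprocess_sketch(im_emb, gallery_embeddings, k=10):
    # Distance from the query embedding to every gallery embedding
    dists = np.linalg.norm(gallery_embeddings - im_emb, axis=1)
    # Indices of the k closest gallery items, nearest first
    return np.argsort(dists)[:k].tolist()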
Example 7
    def classification_request(self, img):
        """
        Sends request to the running model at the tensorrt inference server and returns the response.
        
        Arguments:
        img(PIL.Image) : Input image.
        
        Returns:
        index(int) : Index of the predicted class.
        """
        with grpc.insecure_channel(self._URL) as channel:
            grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
            request = self.prepare_request(img)
            response = grpc_stub.Infer(request)
            result = response.raw_output[0]
            res_arr = np.frombuffer(result, dtype=np.float32)
            res_arr = np.reshape(res_arr, (1000, ))
            index = np.argmax(utils.Utils.softmax(self, res_arr))

            return index
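utils.Utils.softmax is not included in this snippet; a minimal NumPy softmax it is assumed to resemble, numerically stabilised by subtracting the maximum score:

import numpy as np

def softmax(scores):
    # Shift by the max score so the exponentials cannot overflow
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)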
        '-u',
        '--url',
        type=str,
        required=False,
        default='localhost:8001',
        help='Inference server URL. Default is localhost:8001.')
    parser.add_argument('image_filename',
                        type=str,
                        nargs='?',
                        default=None,
                        help='Input image.')
    FLAGS = parser.parse_args()

    # Create gRPC stub for communicating with the server
    channel = grpc.insecure_channel(FLAGS.url)
    grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    # Prepare request for Status gRPC
    request = grpc_service_pb2.StatusRequest(model_name=FLAGS.model_name)
    # Call and receive response from Status gRPC
    response = grpc_stub.Status(request)
    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    input_name, output_name, c, h, w, format, dtype = parse_model(
        response, FLAGS.model_name, FLAGS.batch_size, FLAGS.verbose)

    filledRequestGenerator = partial(requestGenerator, input_name, output_name,
                                     c, h, w, format, dtype, FLAGS)

    # Send requests of FLAGS.batch_size images. If the number of
    # images isn't an exact multiple of FLAGS.batch_size then just
    # start over with the first images until the batch is filled.
def run_client():
    """
    Ask a question of context on TRTIS.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    eval_examples = read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=FLAGS.version_2_with_negative)

    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)  # or 'grpc'

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()
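    # With time_list sorted ascending, max(time_list[:int(n * 0.95)]) is the
    # largest latency among the fastest 95% of requests, i.e. an approximation
    # of the 95th-percentile latency; the 99% and 100% figures below follow
    # the same pattern.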

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir,
                                     "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                             "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)
def get_prediction(image_filename,
                   server_host='localhost',
                   server_port=8001,
                   model_name="end2end-demo",
                   model_version=None):
    """
  Retrieve a prediction from a TensorFlow model server

  :param image:       a end2end-demo image
  :param server_host: the address of the TensorRT inference server
  :param server_port: the port used by the server
  :param model_name: the name of the model
  :param timeout:     the amount of time to wait for a prediction to complete
  :return 0:          the integer predicted in the end2end-demo image
  :return 1:          the confidence scores for all classes
  """
    channel = grpc.insecure_channel(server_host + ':' + str(server_port))
    grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    # https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/http_grpc_api.html#section-api-status
    # Prepare request for Status gRPC
    request = grpc_service_pb2.StatusRequest(model_name=model_name)
    # Call and receive response from Status gRPC
    response = grpc_stub.Status(request)
    print('response:', response)
    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    batch_size = 1
    verbose = False
    input_name, output_name, c, h, w, format, dtype = parse_model(
        response, model_name, batch_size, verbose)
    logging.info("Got status for model %s:", model_name)
    print("Got status for model" + model_name)
    status_full = "input_name=%s, output_name=%s, c=%s, h=%s, w=%s, format=%s, dtype=%s " % (
        input_name, output_name, c, h, w, format, dtype)
    logging.info(status_full)
    print(status_full)

    filledRequestGenerator = partial(requestGenerator, input_name, output_name,
                                     c, h, w, format, dtype, model_name,
                                     model_version, image_filename)

    # Send requests of batch_size images. If the number of
    # images isn't an exact multiple of batch_size then just
    # start over with the first images until the batch is filled.
    result_filenames = []
    requests = []
    responses = []

    # Send request
    for request in filledRequestGenerator(result_filenames):
        responses.append(grpc_stub.Infer(request))

    # For async, retrieve results according to the send order
    for request in requests:
        responses.append(request.result())
    # print("responses={}".format(responses))

    idx = 0
    logging.info('responses size: %d', len(responses))
    for response in responses:
        print("Request {}, batch size {}".format(idx, batch_size))
        print("result_filenames={}".format(result_filenames))
        print("response={}".format(response))
        count = len(response.meta_data.output)
        if count != 1:
            err = "expected 1 result, got {}:{}".format(
                count, response.request_status.msg)
            raise Exception(err)
        label, score = postprocess(response.meta_data.output,
                                   result_filenames[idx], batch_size)
        idx += 1

    return label, score
def run_client():
    """
    Ask a question of context on TRTIS.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # -------------------------------------------------------------
    # Creation of examples here
    # -------------------------------------------------------------
    paragraph = """The koala (Phascolarctos cinereus, or, inaccurately, koala bear[a]) is an arboreal herbivorous marsupial native to Australia. It is the only extant representative of the family Phascolarctidae and its closest living relatives are the wombats, which comprise the family Vombatidae. The koala is found in coastal areas of the mainland's eastern and southern regions, inhabiting Queensland, New South Wales, Victoria, and South Australia. It is easily recognisable by its stout, tailless body and large head with round, fluffy ears and large, spoon-shaped nose. The koala has a body length of 60–85 cm (24–33 in) and weighs 4–15 kg (9–33 lb). Fur colour ranges from silver grey to chocolate brown. Koalas from the northern populations are typically smaller and lighter in colour than their counterparts further south. These populations possibly are separate subspecies, but this is disputed.
    """
    question_text = "Who is Koala?"
    examples = []
    example = SquadExample(
        qas_id=1,
        question_text=question_text,
        doc_tokens=convert_doc_tokens(paragraph_text=paragraph))
    for iterator in range(30):
        examples.append(example)

    # Switching from predict_file read to api-read
    # eval_examples = read_squad_examples(
    #     input_file=FLAGS.predict_file, is_training=False,
    #     version_2_with_negative=FLAGS.version_2_with_negative)
    eval_examples = examples
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)  # or 'grpc'

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()
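    # With time_list sorted ascending, max(time_list[:int(n * 0.95)]) is the
    # largest latency among the fastest 95% of requests, i.e. an approximation
    # of the 95th-percentile latency; the 99% and 100% figures below follow
    # the same pattern.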

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")
    def new_connection(self):
        server_address = "{}:8001".format(self.host)
        self.channel = grpc.insecure_channel(server_address)
        self.stub = grpc_service_pb2_grpc.GRPCServiceStub(self.channel)
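A hedged sketch of how this helper might sit inside a small client class; the class name, the host attribute, and the status query are assumptions, and only the new_connection body comes from the snippet above:

import grpc
import grpc_service_pb2
import grpc_service_pb2_grpc


class TRTISClient:
    def __init__(self, host):
        self.host = host
        self.new_connection()

    def new_connection(self):
        server_address = "{}:8001".format(self.host)
        self.channel = grpc.insecure_channel(server_address)
        self.stub = grpc_service_pb2_grpc.GRPCServiceStub(self.channel)


# Assumes a TensorRT inference server is running locally with a "bolt" model
client = TRTISClient("localhost")
print(client.stub.Status(grpc_service_pb2.StatusRequest(model_name="bolt")))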