Ejemplo n.º 1
0
    def predict(self, features: Dict) -> Dict:
        if not self.infer_ctx:
            self.infer_ctx = InferContext(self.predictor_host,
                                          self.protocol,
                                          self.model_name,
                                          self.model_version,
                                          http_headers='',
                                          verbose=True)

        batch_size = 1
        unique_ids = np.int32([1])
        segment_ids = features["segment_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        result = self.infer_ctx.run(
            {
                'unique_ids': (unique_ids, ),
                'segment_ids': (segment_ids, ),
                'input_ids': (input_ids, ),
                'input_mask': (input_mask, )
            }, {
                'end_logits': InferContext.ResultFormat.RAW,
                'start_logits': InferContext.ResultFormat.RAW
            }, batch_size)
        return result
Ejemplo n.º 2
0
 def infer(self, input_batch):
     ctx = InferContext(f"localhost:{TRT_GRPC_PORT}",
                        ProtocolType.from_str('gRPC'), 'ResNet50', -1,
                        False, 0, False)
     ctx.run({self.input_name: input_batch},
             {self.output_name:
              (InferContext.ResultFormat.CLASS, 1)}, self.batch_size)
Ejemplo n.º 3
0
 def close(self):
     """Delete any held correlation_ids, and then close the context. 
     Any future calls to object will result in an Error.
     """
     for id in self.correlation_ids():
         self.delete(id)
     # make it work with both 2 and 3 as InferContext does not
     # inherit from object, so super in broken in 2.
     InferContext.close(self)
Ejemplo n.º 4
0
 def __init__(self, url, model_name='cidmgr', model_version=-1,
              verbose=False, correlation_id=1, streaming=False):
     protocol = ProtocolType.from_str("grpc")
     self._id_registry = set()
     # make it work with both 2 and 3 as InferContext does not
     # inherit from object, so super in broken in 2.
     InferContext.__init__(self,
         url, protocol, model_name, model_version, 
         verbose, correlation_id, streaming)
Ejemplo n.º 5
0
 def __init__(self, url, model_name, model_version=None,
              verbose=False, cidmgr_context=None, streaming=False):
     if not cidmgr_context:
         cidmgr_context = CIDMgrContext(url, verbose=verbose)
     self.cidmgr=cidmgr_context
     protocol = ProtocolType.from_str("grpc")
     correlation_id = self.cidmgr.new()
     # make it work with both 2 and 3 as InferContext does not
     # inherit from object, so super in broken in 2.
     InferContext.__init__(self,
         url, protocol, model_name, model_version, 
         verbose, correlation_id, streaming)
Ejemplo n.º 6
0
    def infer(flags, input_name, output_name, batch):
        """
        Perform inference non-asynchronosly
        @param flags: The server flags
        @param input_name: The name of the input field
        @param output_name: The name of the output field
        @param batch: The batch of images to send
        @return: The elapsed time for perfroming an inference request
        """
        inference_context = InferContext(
            (flags.url + ":" + str(flags.port)), flags.protocol,
            flags.model_name, flags.model_version, False, 0, flags.streaming,
            [])

        start_time = time()

        inference_context.run(
            {input_name: batch},
            {output_name: (InferContext.ResultFormat.CLASS, flags.classes)},
            flags.batch_size)

        end_time = time()

        return end_time - start_time
def run_client():
    """
    Ask a question of context on TRTIS.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    eval_examples = read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=FLAGS.version_2_with_negative)

    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)  # or 'grpc'

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig(
    )

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir,
                                     "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                             "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)
def run_client():
    """
    Ask a question of context on TRTIS.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # -------------------------------------------------------------
    # Creation of examples here
    # -------------------------------------------------------------
    paragraph = """The koala (Phascolarctos cinereus, or, inaccurately, koala bear[a]) is an arboreal herbivorous marsupial native to Australia. It is the only extant representative of the family Phascolarctidae and its closest living relatives are the wombats, which comprise the family Vombatidae. The koala is found in coastal areas of the mainland's eastern and southern regions, inhabiting Queensland, New South Wales, Victoria, and South Australia. It is easily recognisable by its stout, tailless body and large head with round, fluffy ears and large, spoon-shaped nose. The koala has a body length of 60–85 cm (24–33 in) and weighs 4–15 kg (9–33 lb). Fur colour ranges from silver grey to chocolate brown. Koalas from the northern populations are typically smaller and lighter in colour than their counterparts further south. These populations possibly are separate subspecies, but this is disputed.
    """
    question_text = "Who is Koala?"
    examples = []
    example = SquadExample(
        qas_id=1,
        question_text=question_text,
        doc_tokens=convert_doc_tokens(paragraph_text=paragraph))
    for iterator in range(30):
        examples.append(example)

    # Switching from predict_file read to api-read
    # eval_examples = read_squad_examples(
    #     input_file=FLAGS.predict_file, is_training=False,
    #     version_2_with_negative=FLAGS.version_2_with_negative)
    eval_examples = examples
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)  # or 'grpc'

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)

    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)

    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig(
    )

    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):

        if (len(outstanding) == 0):
            return

        ready_id = ctx.get_ready_async_request(do_wait)

        if (ready_id is None):
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)

        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict,
                                 outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while (len(outstanding) > 0):
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()

    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
    print("Latency Average (ms)  =", avg * 1000)
    print("-----------------------------")
Ejemplo n.º 9
0
class BertTransformer(kfserving.KFModel):
    def __init__(self, name: str, predictor_host: str):
        super().__init__(name)
        self.short_paragraph_text = "The Apollo program was the third United States human spaceflight program. " \
                                    "First conceived as a three-man spacecraft to follow the one-man Project Mercury " \
                                    "which put the first Americans in space, Apollo was dedicated to " \
                                    "President John F. Kennedy's national goal of landing a man on the Moon. " \
                                    "The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972" \
                                    " followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with " \
                                    "the Soviet Union in 1975."

        self.predictor_host = predictor_host
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file="/mnt/models/vocab.txt", do_lower_case=True)
        self.model_name = "bert_tf_v2_large_fp16_128_v2"
        self.model_version = -1
        self.protocol = ProtocolType.from_str('http')
        self.infer_ctx = None

    def preprocess(self, inputs: Dict) -> Dict:
        self.doc_tokens = data_processing.convert_doc_tokens(
            self.short_paragraph_text)
        self.features = data_processing.convert_examples_to_features(
            self.doc_tokens, inputs["instances"][0], self.tokenizer, 128, 128,
            64)
        return self.features

    def predict(self, features: Dict) -> Dict:
        if not self.infer_ctx:
            self.infer_ctx = InferContext(self.predictor_host,
                                          self.protocol,
                                          self.model_name,
                                          self.model_version,
                                          http_headers='',
                                          verbose=True)

        batch_size = 1
        unique_ids = np.int32([1])
        segment_ids = features["segment_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        result = self.infer_ctx.run(
            {
                'unique_ids': (unique_ids, ),
                'segment_ids': (segment_ids, ),
                'input_ids': (input_ids, ),
                'input_mask': (input_mask, )
            }, {
                'end_logits': InferContext.ResultFormat.RAW,
                'start_logits': InferContext.ResultFormat.RAW
            }, batch_size)
        return result

    def postprocess(self, result: Dict) -> Dict:
        end_logits = result['end_logits'][0]
        start_logits = result['start_logits'][0]
        n_best_size = 20

        # The maximum length of an answer that can be generated. This is needed
        #  because the start and end predictions are not conditioned on one another
        max_answer_length = 30

        (prediction, nbest_json, scores_diff_json) = \
            data_processing.get_predictions(self.doc_tokens, self.features, start_logits, end_logits, n_best_size,
                                            max_answer_length)
        return {
            "predictions": prediction,
            "prob": nbest_json[0]['probability'] * 100.0
        }
Ejemplo n.º 10
0
                        nargs='?',
                        default=None,
                        help='Input image / Input folder.')
    FLAGS = parser.parse_args()

    protocol = ProtocolType.from_str(FLAGS.protocol)

    if FLAGS.streaming and protocol != ProtocolType.GRPC:
        raise Exception("Streaming is only allowed with gRPC protocol")

    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    input_name, output_name, c, h, w, format, dtype = parse_model(
        FLAGS.url, protocol, FLAGS.model_name, FLAGS.batch_size, FLAGS.verbose)

    ctx = InferContext(FLAGS.url, protocol, FLAGS.model_name,
                       FLAGS.model_version, FLAGS.verbose, 0, FLAGS.streaming)

    filenames = []
    if os.path.isdir(FLAGS.image_filename):
        filenames = [
            os.path.join(FLAGS.image_filename, f)
            for f in os.listdir(FLAGS.image_filename)
            if os.path.isfile(os.path.join(FLAGS.image_filename, f))
        ]
    else:
        filenames = [
            FLAGS.image_filename,
        ]

    filenames.sort()
Ejemplo n.º 11
0
 def infer(self, request):
     name = self.model_info.architecture
     version = self.model_info.version.ver
     ctx = InferContext(self.SERVER_URI, ProtocolType.GRPC, name, version)
     ctx.run(request[0], request[1], self.batch_size)
Ejemplo n.º 12
0
def main(_):
    """
    Ask a question of context on Triton.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """
    os.environ[
        "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"  #causes memory fragmentation for bert leading to OOM

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the Data
    if FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    elif FLAGS.question and FLAGS.answer:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]

        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    else:
        raise ValueError(
            "Either predict_file or question+answer need to defined")

    # Get Eval Features = Preprocessing
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = True
    model_name = FLAGS.triton_model_name
    model_version = FLAGS.triton_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)  # or 'grpc'

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    status_ctx = ServerStatusContext(url,
                                     protocol,
                                     model_name=model_name,
                                     verbose=verbose)

    model_config_pb2.ModelConfig()

    status_result = status_ctx.get_server_status()
    user_data = UserData()

    max_outstanding = 20
    # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):

        if (outstanding == 0 or do_wait is False):
            return outstanding

        # Wait for deferred items from callback functions
        (infer_ctx, ready_id, idx, start_time,
         inputs) = user_data._completed_requests.get()

        if (ready_id is None):
            return outstanding

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id)
        stop = time.time()

        if (result is None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outstanding -= 1

        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])

        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)
        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()
    idx = 0
    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start_time = time.time()
        ctx.async_run(partial(completion_callback, user_data, idx, start_time,
                              inputs_dict),
                      inputs_dict,
                      outputs_dict,
                      batch_size=present_batch_size)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding,
                                          outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while (outstanding > 0):
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write results to json file
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
        print("Latency Average (ms)  =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print(
            "Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
            (FLAGS.context, FLAGS.question, all_predictions[0]))
Ejemplo n.º 13
0
def init_trtis(url, protocol, model_name, verbose=True):
    ctx = InferContext(url, protocol, model_name, 1)
    return ctx
Ejemplo n.º 14
0
 def close(self):
     self.cidmgr.delete(self.correlation_id())
     # make it work with both 2 and 3 as InferContext does not
     # inherit from object, so super in broken in 2.
     InferContext.close(self)
from tensorrtserver.api import api_pb2
from tensorrtserver.api import grpc_service_pb2
from tensorrtserver.api import grpc_service_pb2_grpc
from tensorrtserver.api import InferContext
from tensorrtserver.api import ProtocolType

from utils.triton_utils import parse_model

if __name__ == '__main__':
    url = "localhost:8001"
    protocol = ProtocolType.from_str("grpc")
    model_name = "pinet_pytorch"
    model_version = 1

    parse_model(url, protocol, model_name)

    ctx = InferContext(url, protocol, model_name, 1, False, 0, False)

    test_image = cv2.imread("./tmp/image1.png")
    test_image = cv2.resize(test_image, (512, 256))
    test_image = test_image[:, :, ::-1]  # bgr to rgb
    test_image = test_image.astype(np.float32) / 255.0
    test_image = np.transpose(test_image, [2, 0, 1])  # HWC to CHW

    ctx.run({"INPUT__0": [test_image]}, {
        "OUTPUT__0": InferContext.ResultFormat.RAW,
        "OUTPUT__1": InferContext.ResultFormat.RAW,
        "OUTPUT__2": InferContext.ResultFormat.RAW
    },
            batch_size=1)