def test_batch_request_for_batching_model(self):
    input_size = 16

    # graphdef_nobatch_int32_int8_int8 is the non-batching version.
    # The server should return an error if the batch size dimension
    # is included in the shape.
    tensor_shape = (1, input_size)
    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000',
                                                                   verbose=True)
            inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001',
                                                                   verbose=True)
            inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        results = triton_client.infer(model_name, inputs, outputs=outputs)
def setUp(self):
    self.model_name_ = "repeat_int32"

    self.inputs_ = []
    self.inputs_.append(grpcclient.InferInput('IN', [1, 1], "INT32"))
    self.inputs_.append(grpcclient.InferInput('DELAY', [1, 1], "UINT32"))
    self.inputs_.append(grpcclient.InferInput('WAIT', [1, 1], "UINT32"))

    self.outputs_ = []
    self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
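# The setUp above targets the decoupled "repeat" example model, which can return
# multiple responses per request, so a plain blocking infer() call is not enough.
# The sketch below is an assumption, not part of the original test: the URL,
# sample input values, and method name are placeholders showing how such a test
# might drive the prepared inputs over the gRPC streaming API.
def _example_stream_once(self):
    import queue
    import numpy as np
    import tritonclient.grpc as grpcclient

    self.inputs_[0].set_data_from_numpy(np.array([[4]], dtype=np.int32))   # IN
    self.inputs_[1].set_data_from_numpy(np.array([[0]], dtype=np.uint32))  # DELAY
    self.inputs_[2].set_data_from_numpy(np.array([[0]], dtype=np.uint32))  # WAIT

    responses = queue.Queue()
    client = grpcclient.InferenceServerClient("localhost:8001")
    # The stream callback is invoked with (result, error) for every response
    client.start_stream(callback=lambda result, error: responses.put((result, error)))
    client.async_stream_infer(model_name=self.model_name_,
                              inputs=self.inputs_,
                              outputs=self.outputs_)
    client.stop_stream()  # blocks until all in-flight responses are delivered
    while not responses.empty():
        result, error = responses.get()
        if error is None:
            print(result.as_numpy('OUT'))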
def infer(self, _need_tensor_check=False, **_input_tensor):
    inputs = []
    assert _input_tensor.keys() == set(self.all_inputs.keys()), \
        f'{self.model_name}: input tensors do not match the model inputs'
    for m_name, m_tensor_info in self.all_inputs.items():
        m_tensor = _input_tensor[m_name]
        if not (isinstance(m_tensor, np.ndarray)
                and m_tensor.dtype.name in self.numpy_data_type_mapper):
            raise InferenceTensorCheckFailException(
                f'tensor {m_name} is not a supported numpy array')
        if _need_tensor_check:
            check_status, check_result = m_tensor_info.tensor_check(
                m_tensor, 3 * 10 * 1024 * 1024)
            if not check_status:
                raise InferenceTensorCheckFailException(check_result)
        m_normalized_tensor = m_tensor_info.normalize(
            m_tensor, _tensor_format='chw').astype(m_tensor.dtype)
        m_infer_input = tritongrpcclient.InferInput(
            m_name, m_normalized_tensor.shape,
            self.numpy_data_type_mapper[m_normalized_tensor.dtype.name])
        m_infer_input.set_data_from_numpy(m_normalized_tensor)
        inputs.append(m_infer_input)
    results = self.triton_client.infer(model_name=self.model_name,
                                       model_version=self.model_version,
                                       inputs=inputs)
    to_return_result = dict()
    for m_result_name in self.all_outputs.keys():
        to_return_result[m_result_name] = results.as_numpy(m_result_name)
    return to_return_result
def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess the image into input data according to model requirements
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=False)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=False,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
def detector(self, frames):
    infer_inputs = [
        triton.InferInput('input_1', (len(frames), 3, *self.resize[::-1]), "FP32")
    ]
    frames = np.array(frames, dtype=np.float32)
    frames = np.transpose(frames, (0, 3, 1, 2))
    infer_inputs[0].set_data_from_numpy(frames)
    result = self.triton_client.infer('retinanet', infer_inputs)
    scores = result.as_numpy('scores').reshape((-1, 100))
    boxes = result.as_numpy('boxes').reshape((-1, 100, 4))
    classes = result.as_numpy('classes').reshape((-1, 100))

    # Calculate embeddings for all the detected subjects
    embs = []
    scores_filtered = []
    boxes_filtered = []
    for i in range(len(frames)):
        # Only keep 'person' detections (class 0) with score > 0.4
        mask = (scores[i] > 0.4) & (classes[i] == 0)
        scores_i = scores[i, mask]
        boxes_i = boxes[i, mask]
        scores_i, boxes_i = self.bbox_filter(scores_i, boxes_i)
        img = frames[i].astype(np.uint8)  # (3, 800, 1280)
        embs_i = []
        boxes_i = boxes_i.astype(int)
        for j in range(len(boxes_i)):
            imp = img[:, boxes_i[j, 1]:boxes_i[j, 3], boxes_i[j, 0]:boxes_i[j, 2]]
            imp = np.transpose(imp, (1, 2, 0))
            imp = Image.fromarray(imp)
            data = [
                np.asarray(transforms.Resize(size=(256, 128))(imp)).astype(np.float32)
            ]
            inputs = []
            inputs.append(
                tritongrpcclient.InferInput('image', [len(data), 256, 128, 3], "FP32"))
            # Initialize the data
            inputs[0].set_data_from_numpy(np.asarray(data))
            outputs = []
            outputs.append(tritongrpcclient.InferRequestedOutput('features'))
            results = self.triton_client.infer('osnet_ensemble', inputs,
                                               outputs=outputs)
            emb = np.squeeze(results.as_numpy('features'))
            embs_i.append(emb / np.linalg.norm(emb))
        embs.append(embs_i)
        scores_filtered.append(scores_i)
        boxes_filtered.append(boxes_i)
    return np.asarray(scores_filtered), np.asarray(boxes_filtered), np.asarray(embs)
def setUp(self):
    self.trials_ = [("repeat_int32", None), ("simple_repeat", None),
                    ("sequence_repeat", None),
                    ("repeat_square", self._nested_validate),
                    ("nested_square", self._nested_validate)]
    self.model_name_ = "repeat_int32"

    self.inputs_ = []
    self.inputs_.append(grpcclient.InferInput('IN', [1], "INT32"))
    self.inputs_.append(grpcclient.InferInput('DELAY', [1], "UINT32"))
    self.inputs_.append(grpcclient.InferInput('WAIT', [1], "UINT32"))

    self.outputs_ = []
    self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
    self.outputs_.append(grpcclient.InferRequestedOutput('IDX'))
    # Some trials only expect a subset of outputs
    self.requested_outputs_ = self.outputs_
def _initialize_model(self):
    input_cfg = self.model_config['config']['input']
    output_cfg = self.model_config['config']['output']
    input_names = [i['name'] for i in input_cfg]
    output_names = [o['name'] for o in output_cfg]
    print('Input layers: ', input_names)
    print('Output layers: ', output_names)
    input_dims = [[int(dim) for dim in input_cfg[i]['dims']]
                  for i in range(len(input_cfg))]
    output_dims = [[int(dim) for dim in output_cfg[i]['dims']]
                   for i in range(len(output_cfg))]
    self.input_shape = input_dims[0]
    self.output_dims = output_dims

    # Bytes per element for the configured precision
    if self.triton_cfg['model']['precision'] == "FP32":
        mult = 4
    elif self.triton_cfg['model']['precision'] == "FP16":
        mult = 2  # TODO: Fix this
    elif self.triton_cfg['model']['precision'] == "INT8":
        mult = 1  # TODO: Fix this
    else:
        print("unsupported precision in config file: " +
              str(self.triton_cfg['model']['precision']))
        sys.exit()

    input_byte_sizes_list = [self._prod(dims) * mult for dims in input_dims]
    output_byte_sizes_list = [self._prod(dims) * mult for dims in output_dims]

    for i in range(len(input_cfg)):
        shm_region_name = self.model_name + "_input" + str(i)
        self._register_system_shm_regions(shm_region_name, self.input_handles,
                                          input_byte_sizes_list[i], input_names[i])
        self.input_layers.append(
            tritongrpcclient.InferInput(
                input_names[i],
                [1, input_dims[i][0], input_dims[i][1], input_dims[i][2]],
                "FP32"))
        self.input_layers[-1].set_shared_memory(shm_region_name,
                                                input_byte_sizes_list[i])

    for i in range(len(output_cfg)):
        shm_region_name = self.model_name + "_output" + str(i)
        self._register_system_shm_regions(shm_region_name, self.output_handles,
                                          output_byte_sizes_list[i], output_names[i])
        self.output_layers.append(
            tritongrpcclient.InferRequestedOutput(output_names[i]))
        self.output_layers[-1].set_shared_memory(shm_region_name,
                                                 output_byte_sizes_list[i])
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(url=FLAGS.url,
                                                               verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    print("Loading images")
    image_data, labels = load_images(
        FLAGS.img_dir if FLAGS.img_dir is not None else FLAGS.img)
    image_data = array_from_list(image_data)
    print("Images loaded, inferring")

    # Infer
    outputs = []
    input_name = "INPUT"
    output_name = "OUTPUT"
    input_shape = list(image_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    img_idx = 0
    for batch in batcher(image_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [tritongrpcclient.InferInput(input_name, input_shape, "UINT8")]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))

        maxs = np.argmax(output0_data, axis=1)
        for i in range(len(maxs)):
            print("Sample ", i, " - label: ", maxs[i], " ~ ", output0_data[i, maxs[i]])
            if maxs[i] != labels[img_idx]:
                sys.exit(1)
            else:
                print("pass")
            img_idx += 1

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    # Random-length uint8 samples, enough for several batches
    input_data = [
        randint(0, 255, size=randint(100), dtype='uint8')
        for _ in range(randint(100) * FLAGS.batch_size)
    ]
    input_data = array_from_list(input_data)

    # Infer
    outputs = []
    input_name = "DALI_INPUT_0"
    output_name = "DALI_OUTPUT_0"
    input_shape = list(input_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    for batch in batcher(input_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [tritongrpcclient.InferInput(input_name, input_shape, "UINT8")]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        if not math.isclose(np.mean(output0_data), np.mean(batch)):
            print("Pre/post average does not match")
            sys.exit(1)
        else:
            print("pass")

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
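# Both main() variants above call array_from_list() and batcher(), which are not
# shown here. The helpers below are a minimal sketch of what they might look
# like; their exact semantics (zero-pad ragged 1-D samples to a common length,
# then yield fixed-size slices) are an assumption, not taken from the source.
import numpy as np

def array_from_list(arrays):
    # Pad every sample with zeros to the longest length and stack into [N, max_len]
    max_len = max(len(a) for a in arrays)
    padded = [np.pad(a, (0, max_len - len(a))) for a in arrays]
    return np.stack(padded, axis=0)

def batcher(data, batch_size):
    # Yield consecutive slices of at most batch_size samples
    for start in range(0, data.shape[0], batch_size):
        yield data[start:start + batch_size]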
def test_nobatch_request_for_batching_model(self):
    input_size = 16

    # graphdef_int32_int8_int8 has a batching version with max batch size of 8.
    # The server should return an error if the batch size is not included in the
    # input shapes.
    tensor_shape = (input_size,)
    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000',
                                                                   verbose=True)
            inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001',
                                                                   verbose=True)
            inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        try:
            results = triton_client.infer(model_name, inputs, outputs=outputs)
            self.assertTrue(
                False, "expected failure with no batch request for batching model")
        except InferenceServerException as ex:
            pass
def _prepare_request(self, protocol):
    if protocol == "grpc":
        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    else:
        self.inputs_ = []
        self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

    self.inputs_[0].set_data_from_numpy(self.input0_data_)
def request_eval(hit_data, row_splits, triton_client, model_name):
    np_rs_type = 'int64'
    tr_rs_type = 'INT64'

    inputs = []
    outputs = []
    # print(hit_data.shape)
    # print(row_splits.shape)

    inputs.append(tritongrpcclient.InferInput('input_1', hit_data.shape, 'FP32'))
    inputs.append(tritongrpcclient.InferInput('input_2', row_splits.shape, tr_rs_type))  # INT64
    inputs[0].set_data_from_numpy(hit_data)
    inputs[1].set_data_from_numpy(row_splits)

    outputs.append(tritongrpcclient.InferRequestedOutput('output'))
    outputs.append(tritongrpcclient.InferRequestedOutput('output_1'))
    # outputs.append(tritongrpcclient.InferRequestedOutput('predicted_final_condensates'))
    # outputs.append(tritongrpcclient.InferRequestedOutput('output_row_splits'))
    # predicted_final_1 doesn't matter

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    condensates = results.as_numpy('output')
    # condensates = results.as_numpy('predicted_final_condensates')
    # rs = results.as_numpy('output_row_splits')
    # print('output', condensates, condensates.shape)
    return condensates
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape, np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that
    # the inference will not have completed when being terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1,), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
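# crashing_client() loops forever and is meant to be killed while a request may
# still be in flight. The driver below is only a sketch of how a test harness
# might use it: the "simple" model name, tensor shape, and URL are assumptions,
# and the parent terminates the child once the shared counter shows that at
# least one inference was attempted.
import multiprocessing
import time
import numpy as np
import tritonclient.grpc as grpcclient
from multiprocessing import shared_memory

def run_crashing_client_once():
    shm_obj = shared_memory.SharedMemory(create=True, size=4)
    count = np.ndarray((1,), dtype=np.int32, buffer=shm_obj.buf)
    count[0] = 0

    def _target():
        client = grpcclient.InferenceServerClient("localhost:8001")
        crashing_client("simple", np.float32, [1, 16], shm_obj.name, client)

    p = multiprocessing.Process(target=_target)
    p.start()
    while count[0] == 0:      # wait until at least one inference was attempted
        time.sleep(0.1)
    p.terminate()             # kill the client mid-request
    p.join()
    shm_obj.close()
    shm_obj.unlink()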
FLAGS = parser.parse_args()

# We use a model that takes one input tensor containing the number of delay
# cycles for which each request should occupy an SM.
model_name = FLAGS.model
model_version = "1"

# Create the data for the input tensor.
input_data = np.array([FLAGS.delay], dtype=np.int32)

# Create the inference context for the model.
if FLAGS.protocol.lower() == "grpc":
    triton_client = tritongrpcclient.InferenceServerClient(
        FLAGS.url, verbose=FLAGS.verbose)
    inputs = [tritongrpcclient.InferInput('in', input_data.shape, "INT32")]
else:
    triton_client = tritonhttpclient.InferenceServerClient(
        FLAGS.url, verbose=FLAGS.verbose)
    inputs = [tritonhttpclient.InferInput('in', input_data.shape, "INT32")]

inputs[0].set_data_from_numpy(input_data)

# Send N inference requests to the inference server and time the inference
# across all of them.
start_time = time()
for i in range(FLAGS.count):
    triton_client.async_infer(model_name, inputs, partial(completion_callback),
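# completion_callback (and the UserData holder used by the examples further
# down) is not defined in these snippets. In Triton's async client examples it
# is usually a small shim that pushes each (result, error) pair onto a queue.
# The sketch below is an assumption about that pattern, not the original code;
# any extra leading arguments (user_data here, or idx/start_time/inputs in the
# BERT client below) must be pre-bound with functools.partial.
import queue

class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()

def completion_callback(user_data, result, error):
    # Triton invokes the callback with (result, error) only
    user_data._completed_requests.put((result, error))

# After the send loop, the sender can drain user_data._completed_requests once
# per issued request to collect results and surface any errors.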
def main(_):
    """
    Ask a question of context on Triton.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """
    # Lazy compilation causes memory fragmentation for BERT, leading to OOM
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.info("***** Configuration *****")
    for key in FLAGS.__flags.keys():
        tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
    tf.compat.v1.logging.info("**************************")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the data
    if FLAGS.question and FLAGS.context:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]
        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    elif FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    else:
        raise ValueError(
            "Either predict_file or question+context needs to be defined")

    # Get eval features (preprocessing)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = False
    model_name = FLAGS.triton_model_name
    model_version = str(FLAGS.triton_model_version)
    batch_size = FLAGS.predict_batch_size

    triton_client = tritongrpcclient.InferenceServerClient(url, verbose)
    model_metadata = triton_client.get_model_metadata(
        model_name=model_name, model_version=model_version)
    model_config = triton_client.get_model_config(model_name=model_name,
                                                  model_version=model_version)

    user_data = UserData()

    max_outstanding = 20  # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):
        if outstanding == 0 or do_wait is False:
            return outstanding

        # Wait for deferred items from callback functions
        (result, error, idx, start_time, inputs) = user_data._completed_requests.get()
        if result is None:
            return outstanding

        stop = time.time()

        if error is not None:
            raise ValueError("Context returned null for async id marked as done")

        outstanding -= 1
        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])
        if FLAGS.trt_engine:
            cls_squad_logits = result.as_numpy("cls_squad_logits")
            try:  # when batch size > 1
                start_logits_results = np.array(cls_squad_logits.squeeze()[:, :, 0])
                end_logits_results = np.array(cls_squad_logits.squeeze()[:, :, 1])
            except:
                start_logits_results = np.expand_dims(
                    np.array(cls_squad_logits.squeeze()[:, 0]), axis=0)
                end_logits_results = np.expand_dims(
                    np.array(cls_squad_logits.squeeze()[:, 1]), axis=0)
        else:
            start_logits_results = result.as_numpy("start_logits")
            end_logits_results = result.as_numpy("end_logits")

        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in start_logits_results[i].flat]
            end_logits = [float(x) for x in end_logits_results[i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)
        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()
    idx = 0
    for inputs_dict in batch(eval_features, batch_size):
        present_batch_size = len(inputs_dict[label_id_key])

        if not FLAGS.trt_engine:
            label_ids_data = np.stack(inputs_dict[label_id_key])
        input_ids_data = np.stack(inputs_dict['input_ids'])
        input_mask_data = np.stack(inputs_dict['input_mask'])
        segment_ids_data = np.stack(inputs_dict['segment_ids'])

        inputs = []
        inputs.append(
            tritongrpcclient.InferInput('input_ids', input_ids_data.shape, "INT32"))
        inputs[0].set_data_from_numpy(input_ids_data)
        inputs.append(
            tritongrpcclient.InferInput('input_mask', input_mask_data.shape, "INT32"))
        inputs[1].set_data_from_numpy(input_mask_data)
        inputs.append(
            tritongrpcclient.InferInput('segment_ids', segment_ids_data.shape, "INT32"))
        inputs[2].set_data_from_numpy(segment_ids_data)
        if not FLAGS.trt_engine:
            inputs.append(
                tritongrpcclient.InferInput(label_id_key, label_ids_data.shape, "INT32"))
            inputs[3].set_data_from_numpy(label_ids_data)

        outputs = []
        if FLAGS.trt_engine:
            outputs.append(tritongrpcclient.InferRequestedOutput('cls_squad_logits'))
        else:
            outputs.append(tritongrpcclient.InferRequestedOutput('start_logits'))
            outputs.append(tritongrpcclient.InferRequestedOutput('end_logits'))

        start_time = time.time()
        triton_client.async_infer(model_name,
                                  inputs,
                                  partial(completion_callback, user_data, idx,
                                          start_time, inputs_dict),
                                  request_id=str(idx),
                                  model_version=model_version,
                                  outputs=outputs)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding, outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while outstanding > 0:
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for "
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write
        # the results to a json file.
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
        print("Latency Average (ms) =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write the best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print("Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
              (FLAGS.context, FLAGS.question, all_predictions[0]))
def req_loop(self):
    client = grpcclient.InferenceServerClient(self._server_url)

    inputs = [
        grpcclient.InferInput("INPUT0", self._shape,
                              np_to_triton_dtype(self._dtype))
    ]

    self._inflight_requests = 0
    start_stat = client.get_inference_statistics(model_name=self._model_name)
    global _exit_signal

    while not _exit_signal:
        input_numpy = np.random.random_sample(self._shape).astype(self._dtype)
        inputs[0].set_data_from_numpy(input_numpy)
        self._input_data.append(input_numpy)

        with self._sync:

            def _check_can_send():
                return self._inflight_requests < _inference_concurrency

            can_send = self._sync.wait_for(_check_can_send,
                                           timeout=_response_wait_time_s)
            self._tester.assertTrue(
                can_send, "client didn't receive a response within {}s".format(
                    _response_wait_time_s))

            callback = functools.partial(AsyncGrpcRunner._on_result, self)
            client.async_infer(
                model_name=self._model_name,
                inputs=inputs,
                request_id="{}".format(self._num_sent_request),
                callback=callback,
            )
            self._inflight_requests += 1
            self._num_sent_request += 1
            if self._num_sent_request == _inference_count:
                _exit_signal = True
            time.sleep(self._delay_ms / 1000.0)

    # Wait until all requested data has been received
    with self._sync:

        def _all_processed():
            return self._inflight_requests == 0

        self._processed_all = self._sync.wait_for(_all_processed,
                                                  _finish_wait_time_s)
        self._tester.assertTrue(
            self._processed_all,
            "the processing didn't complete even after waiting for {}s".format(
                _finish_wait_time_s))

    end_stat = client.get_inference_statistics(model_name=self._model_name)
    self._processed_request_count = (
        end_stat.model_stats[0].inference_stats.success.count -
        start_stat.model_stats[0].inference_stats.success.count)
# Put input data values into shared memory
shm.set_shared_memory_region(shm_ip0_handle, [input0_data_serialized])
shm.set_shared_memory_region(shm_ip1_handle, [input1_data_serialized])

# Register Input0 and Input1 shared memory with Triton Server
triton_client.register_system_shared_memory("input0_data", "/input0_simple",
                                             input0_byte_size)
triton_client.register_system_shared_memory("input1_data", "/input1_simple",
                                             input1_byte_size)

# Set the parameters to use data from shared memory
inputs = []
inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
inputs[-1].set_shared_memory("input0_data", input0_byte_size)

inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))
inputs[-1].set_shared_memory("input1_data", input1_byte_size)

outputs = []
outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
outputs[-1].set_shared_memory("output0_data", output0_byte_size)

outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
outputs[-1].set_shared_memory("output1_data", output1_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)
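# The snippet above assumes the shared-memory regions and the serialized BYTES
# tensors already exist. The setup that would precede it might look like the
# sketch below, modeled on Triton's system-shared-memory string example; the
# region keys, input values, output sizing, and the serialized_byte_size helper
# (present in recent tritonclient releases) are assumptions here.
import numpy as np
import tritonclient.grpc as grpcclient
import tritonclient.utils as utils
import tritonclient.utils.shared_memory as shm

triton_client = grpcclient.InferenceServerClient("localhost:8001")
triton_client.unregister_system_shared_memory()

# Two [1, 16] string tensors, serialized into the wire format used for BYTES
in0 = np.array([[str(i) for i in range(16)]], dtype=object)
in1 = np.array([[str(1) for _ in range(16)]], dtype=object)
input0_data_serialized = utils.serialize_byte_tensor(in0)
input1_data_serialized = utils.serialize_byte_tensor(in1)
input0_byte_size = utils.serialized_byte_size(input0_data_serialized)
input1_byte_size = utils.serialized_byte_size(input1_data_serialized)
# Size the output regions generously (assumed sufficient for this model)
output0_byte_size = output1_byte_size = input0_byte_size + input1_byte_size

# Create the system shared-memory regions and register the output regions
shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_simple",
                                                 input0_byte_size)
shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_simple",
                                                 input1_byte_size)
shm_op0_handle = shm.create_shared_memory_region("output0_data", "/output0_simple",
                                                 output0_byte_size)
shm_op1_handle = shm.create_shared_memory_region("output1_data", "/output1_simple",
                                                 output1_byte_size)
triton_client.register_system_shared_memory("output0_data", "/output0_simple",
                                            output0_byte_size)
triton_client.register_system_shared_memory("output1_data", "/output1_simple",
                                            output1_byte_size)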
FLAGS = parser.parse_args()
try:
    triton_client = tritongrpcclient.InferenceServerClient(
        url=FLAGS.url, verbose=FLAGS.verbose)
except Exception as e:
    print("channel creation failed: " + str(e))
    sys.exit()

model_name = "yolov4"

# Infer
inputs = []
outputs = []
# The built engine expects NCHW input
inputs.append(tritongrpcclient.InferInput("data", [1, 3, 608, 608], "FP32"))

# Initialize the data
image_obj = Image("image_id", raw_image_path=FLAGS.img)
ori_w, ori_h = image_obj.pil_image_obj.size
image_frame, scale_ratio = preprocess(image_obj.pil_image_obj,
                                      input_image_shape=(608, 608))
inputs[0].set_data_from_numpy(image_frame)

outputs.append(tritongrpcclient.InferRequestedOutput("prob"))

# Test with outputs
results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs,
                              headers={"test": "1"})
for npzfile in sorted(glob.glob('/hgcal_testdata/*.npz')):
    print(npzfile)
    inputs = []
    outputs = []
    with np.load(npzfile) as data:
        x = data['X'].astype(np.float32)
        edge_index = build_edge_index(x.shape[0], data['Ri_rows'],
                                      data['Ri_cols'], data['Ro_rows'],
                                      data['Ro_cols'])
        print(x.shape, edge_index.shape)
        nnodes = x.shape[0]
        nedges = edge_index.shape[1]

        inputs.append(tritongrpcclient.InferInput('x__0', [nnodes, 5], 'FP32'))
        inputs.append(
            tritongrpcclient.InferInput('edge_index__1', [2, nedges], "INT64"))
        inputs[0].set_data_from_numpy(x)
        inputs[1].set_data_from_numpy(edge_index)

        outputs.append(tritongrpcclient.InferRequestedOutput('output__0'))
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)
        output0_data = results.as_numpy('output__0')
        print(output0_data)
        del output0_data
def check_sequence_async(client_metadata,
                         trial,
                         model_name,
                         input_dtype,
                         steps,
                         timeout_ms=DEFAULT_TIMEOUT_MS,
                         sequence_name="<unknown>"):
    """Perform a sequence of inferences using the async stream API.

    'steps' holds a list of tuples, one for each inference, with format:

        (flag_str, value, expected_result, delay_ms)
    """
    if (("savedmodel" in trial) or ("graphdef" in trial) or ("custom" in trial)
            or ("plan" in trial)):
        tensor_shape = (1, 1)
    else:
        assert False, "unknown trial type: " + trial

    triton_client = client_metadata[0]
    sequence_id = client_metadata[1]

    # Execute the sequence of inferences...
    seq_start_ms = int(round(time.time() * 1000))
    user_data = UserData()

    # Ensure there is no running stream
    triton_client.stop_stream()
    triton_client.start_stream(partial(completion_callback, user_data))

    sent_count = 0
    for flag_str, value, expected_result, delay_ms in steps:
        seq_start = False
        seq_end = False
        if flag_str is not None:
            seq_start = ("start" in flag_str)
            seq_end = ("end" in flag_str)

        if input_dtype == np.object_:
            in0 = np.full(tensor_shape, value, dtype=np.int32)
            in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object)
            in0 = in0n.reshape(tensor_shape)
        else:
            in0 = np.full(tensor_shape, value, dtype=input_dtype)

        inputs = [
            grpcclient.InferInput("INPUT", tensor_shape,
                                  np_to_triton_dtype(input_dtype)),
        ]
        inputs[0].set_data_from_numpy(in0)

        triton_client.async_stream_infer(model_name,
                                         inputs,
                                         sequence_id=sequence_id,
                                         sequence_start=seq_start,
                                         sequence_end=seq_end)
        sent_count += 1

        if delay_ms is not None:
            time.sleep(delay_ms / 1000.0)

    # Process the results in the order that they were sent
    result = None
    processed_count = 0
    while processed_count < sent_count:
        (results, error) = user_data._completed_requests.get()
        if error is not None:
            raise error

        (_, value, expected, _) = steps[processed_count]
        processed_count += 1
        if timeout_ms is not None:
            now_ms = int(round(time.time() * 1000))
            if (now_ms - seq_start_ms) > timeout_ms:
                raise TimeoutException("Timeout expired for {}".format(sequence_name))

        result = results.as_numpy("OUTPUT")[0][0]
        if FLAGS.verbose:
            print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, result))

        if expected is not None:
            if input_dtype == np.object_:
                assert int(result) == expected, \
                    "{}: expected result {}, got {}".format(
                        sequence_name, expected, int(result))
            else:
                assert result == expected, \
                    "{}: expected result {}, got {}".format(
                        sequence_name, expected, result)

    triton_client.stop_stream()
action="store_true", default=False, help="Use fp16 precision for input data", ) FLAGS = parser.parse_args() triton_client = tritongrpcclient.InferenceServerClient( url=FLAGS.triton_server_url, verbose=FLAGS.verbose ) dataloader = get_data_loader(FLAGS.batch_size, data_path=FLAGS.inference_data) inputs = [] inputs.append( tritongrpcclient.InferInput( "input__0", [FLAGS.batch_size, 3, 224, 224], "FP16" if FLAGS.fp16 else "FP32", ) ) outputs = [] outputs.append(tritongrpcclient.InferRequestedOutput("output__0")) all_img = 0 cor_img = 0 result_prev = None for image, target in tqdm(dataloader): if FLAGS.fp16: image = image.half() inputs[0].set_data_from_numpy(image.numpy())