def test_model_latest_infer(self): input_size = 16 tensor_shape = (1, input_size) platform_name = { 'graphdef': 'tensorflow_graphdef', 'netdef': 'caffe2_netdef' } # There are 3 versions of *_int32_int32_int32 and all # should be available. for platform in ('graphdef', 'netdef'): model_name = platform + "_int32_int32_int32" # Initially there should be no version stats.. try: for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( url=pair[0], verbose=True) else: triton_client = grpcclient.InferenceServerClient( url=pair[0], verbose=True) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) model_metadata = triton_client.get_model_metadata( model_name) # verify all versions are reported when no model version is specified if pair[1] == "http": self.assertEqual(model_name, model_metadata['name']) self.assertEqual(len(model_metadata['versions']), 3) for v in (1, 2, 3): self.assertTrue( str(v) in model_metadata['versions']) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 3) for v in (1, 2, 3): self.assertTrue(str(v) in model_metadata.versions) # verify contents of model metadata if pair[1] == "http": model_platform = model_metadata['platform'] model_inputs = model_metadata['inputs'] model_outputs = model_metadata['outputs'] else: model_platform = model_metadata.platform model_inputs = model_metadata.inputs model_outputs = model_metadata.outputs self.assertEqual(platform_name[platform], model_platform) self.assertEqual(len(model_inputs), 2) self.assertEqual(len(model_outputs), 2) for model_input in model_inputs: if pair[1] == "http": input_dtype = model_input['datatype'] input_shape = model_input['shape'] input_name = model_input['name'] else: input_dtype = model_input.datatype input_shape = model_input.shape input_name = model_input.name self.assertTrue(input_name in ["INPUT0", "INPUT1"]) self.assertEqual(input_dtype, "INT32") self.assertEqual(input_shape, [16]) for model_output in model_outputs: if pair[1] == "http": output_dtype = model_output['datatype'] output_shape = model_output['shape'] output_name = model_output['name'] else: output_dtype = model_output.datatype output_shape = model_output.shape output_name = model_output.name self.assertTrue(output_name in ["OUTPUT0", "OUTPUT1"]) self.assertEqual(output_dtype, "INT32") self.assertEqual(output_shape, [16]) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using latest version (which is 3)... 
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.int32,
                       np.int32,
                       np.int32,
                       model_version=None,
                       swap=True)

        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())
                for v in (1, 2, 3):
                    self.assertTrue(
                        triton_client.is_model_ready(model_name,
                                                     model_version=str(v)))

                # Only version 3 should have infer stats
                infer_stats = triton_client.get_inference_statistics(
                    model_name)
                if pair[1] == "http":
                    stats = infer_stats['model_stats']
                else:
                    stats = infer_stats.model_stats
                self.assertEqual(
                    len(stats), 3,
                    "expected 3 infer stats for model " + model_name)

                for s in stats:
                    if pair[1] == "http":
                        v = s['version']
                        stat = s['inference_stats']
                    else:
                        v = s.version
                        stat = s.inference_stats

                    if v == "3":
                        # assertGreater actually checks the count; assertTrue
                        # with a second argument only treats it as a message.
                        if pair[1] == "http":
                            self.assertGreater(stat['success']['count'], 0)
                        else:
                            self.assertGreater(stat.success.count, 0)
                    else:
                        if pair[1] == "http":
                            self.assertEqual(
                                stat['success']['count'], 0,
                                "unexpected infer success counts for version "
                                + str(v) + " of model " + model_name)
                        else:
                            self.assertEqual(
                                stat.success.count, 0,
                                "unexpected infer success counts for version "
                                + str(v) + " of model " + model_name)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
help="Inference server URL. Default is localhost:8001.") parser.add_argument('-v', "--verbose", action="store_true", required=False, default=False, help='Enable verbose output') parser.add_argument( "--label_file", type=str, default="./model_repository/resnet50_trt/labels.txt", help="Path to the file with text representation of available labels") args = parser.parse_args() try: triton_client = tritongrpcclient.InferenceServerClient( url=args.url, verbose=args.verbose) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) with open(args.label_file) as f: labels_dict = {idx: line.strip() for idx, line in enumerate(f)} inputs = [] outputs = [] input_name = "INPUT" output_name = "OUTPUT" image_data = load_image(args.image) image_data = np.expand_dims(image_data, axis=0) inputs.append(
help='Protocol (HTTP/gRPC) used to communicate with ' + 'the inference service. Default is HTTP.') parser.add_argument('image_filename', type=str, nargs='?', default=None, help='Input image / Input folder.') FLAGS = parser.parse_args() if FLAGS.streaming and FLAGS.protocol.lower() != "grpc": raise Exception("Streaming is only allowed with gRPC protocol") try: if FLAGS.protocol.lower() == "grpc": # Create gRPC client for communicating with the server triton_client = tritongrpcclient.InferenceServerClient( url=FLAGS.url, verbose=FLAGS.verbose) else: # Create HTTP client for communicating with the server triton_client = tritonhttpclient.InferenceServerClient( url=FLAGS.url, verbose=FLAGS.verbose) except Exception as e: print("client creation failed: " + str(e)) sys.exit(1) # Make sure the model matches our requirements, and get some # properties of the model that we need for preprocessing try: model_metadata = triton_client.get_model_metadata( model_name=FLAGS.model_name, model_version=FLAGS.model_version) except InferenceServerException as e: print("failed to retrieve the metadata: " + str(e))
def infer_exact(tester, pf, tensor_shape, batch_size, input_dtype, output0_dtype, output1_dtype, output0_raw=True, output1_raw=True, model_version=None, swap=False, outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True, use_http_json_tensors=True, skip_request_id_check=False, use_streaming=True, correlation_id=0, shm_region_names=None, precreated_shm_regions=None, use_system_shared_memory=False, use_cuda_shared_memory=False, priority=0, timeout_us=0): tester.assertTrue(use_http or use_http_json_tensors or use_grpc or use_streaming) configs = [] if use_http: configs.append(("localhost:8000", "http", False, True)) if output0_raw == output1_raw: # Float16 not supported for Input and Output via JSON if use_http_json_tensors and (input_dtype != np.float16) and \ (output0_dtype != np.float16) and (output1_dtype != np.float16): configs.append(("localhost:8000", "http", False, False)) if use_grpc: configs.append(("localhost:8001", "grpc", False, False)) if use_streaming: configs.append(("localhost:8001", "grpc", True, False)) # outputs are sum and difference of inputs so set max input # values so that they will not overflow the output. This # allows us to do an exact match. For float types use 8, 16, # 32 int range for fp 16, 32, 64 respectively. When getting # class outputs the result value/probability is returned as a # float so must use fp32 range in that case. rinput_dtype = _range_repr_dtype(input_dtype) routput0_dtype = _range_repr_dtype( output0_dtype if output0_raw else np.float32) routput1_dtype = _range_repr_dtype( output1_dtype if output1_raw else np.float32) val_min = max( np.iinfo(rinput_dtype).min, np.iinfo(routput0_dtype).min, np.iinfo(routput1_dtype).min) / 2 val_max = min( np.iinfo(rinput_dtype).max, np.iinfo(routput0_dtype).max, np.iinfo(routput1_dtype).max) / 2 num_classes = 3 input0_array = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) input1_array = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) if input_dtype != np.object: input0_array = input0_array.astype(input_dtype) input1_array = input1_array.astype(input_dtype) if not swap: output0_array = input0_array + input1_array output1_array = input0_array - input1_array else: output0_array = input0_array - input1_array output1_array = input0_array + input1_array if output0_dtype == np.object: output0_array = np.array([ unicode(str(x), encoding='utf-8') for x in (output0_array.flatten()) ], dtype=object).reshape(output0_array.shape) else: output0_array = output0_array.astype(output0_dtype) if output1_dtype == np.object: output1_array = np.array([ unicode(str(x), encoding='utf-8') for x in (output1_array.flatten()) ], dtype=object).reshape(output1_array.shape) else: output1_array = output1_array.astype(output1_dtype) if input_dtype == np.object: in0n = np.array( [str(x) for x in input0_array.reshape(input0_array.size)], dtype=object) input0_array = in0n.reshape(input0_array.shape) in1n = np.array( [str(x) for x in input1_array.reshape(input1_array.size)], dtype=object) input1_array = in1n.reshape(input1_array.shape) # prepend size of string to output string data if output0_dtype == np.object: if batch_size == 1: output0_array_tmp = serialize_byte_tensor_list([output0_array]) else: output0_array_tmp = serialize_byte_tensor_list(output0_array) else: output0_array_tmp = output0_array if output1_dtype == np.object: if batch_size == 1: output1_array_tmp = serialize_byte_tensor_list([output1_array]) else: output1_array_tmp = 
serialize_byte_tensor_list(output1_array) else: output1_array_tmp = output1_array OUTPUT0 = "OUTPUT0" OUTPUT1 = "OUTPUT1" INPUT0 = "INPUT0" INPUT1 = "INPUT1" if pf == "libtorch" or pf == "libtorch_nobatch": OUTPUT0 = "OUTPUT__0" OUTPUT1 = "OUTPUT__1" INPUT0 = "INPUT__0" INPUT1 = "INPUT__1" output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp]) output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp]) if batch_size == 1: input0_list = [input0_array] input1_list = [input1_array] else: input0_list = [x for x in input0_array] input1_list = [x for x in input1_array] # Serialization of string tensors in the case of shared memory must be done manually if input_dtype == np.object: input0_list_tmp = serialize_byte_tensor_list(input0_list) input1_list_tmp = serialize_byte_tensor_list(input1_list) else: input0_list_tmp = input0_list input1_list_tmp = input1_list input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp]) input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp]) # Create system/cuda shared memory regions if needed shm_regions, shm_handles = su.create_set_shm_regions( input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size, outputs, shm_region_names, precreated_shm_regions, use_system_shared_memory, use_cuda_shared_memory) if model_version is not None: model_version = str(model_version) else: model_version = "" # Run inference and check results for each config for config in configs: model_name = tu.get_model_name(pf, input_dtype, output0_dtype, output1_dtype) if config[1] == "http": triton_client = httpclient.InferenceServerClient(config[0], verbose=True) else: triton_client = grpcclient.InferenceServerClient(config[0], verbose=True) inputs = [] if config[1] == "http": inputs.append( httpclient.InferInput(INPUT0, tensor_shape, np_to_triton_dtype(input_dtype))) inputs.append( httpclient.InferInput(INPUT1, tensor_shape, np_to_triton_dtype(input_dtype))) else: inputs.append( grpcclient.InferInput(INPUT0, tensor_shape, np_to_triton_dtype(input_dtype))) inputs.append( grpcclient.InferInput(INPUT1, tensor_shape, np_to_triton_dtype(input_dtype))) if not (use_cuda_shared_memory or use_system_shared_memory): if config[1] == "http": inputs[0].set_data_from_numpy(input0_array, binary_data=config[3]) inputs[1].set_data_from_numpy(input1_array, binary_data=config[3]) else: inputs[0].set_data_from_numpy(input0_array) inputs[1].set_data_from_numpy(input1_array) else: # Register necessary shared memory regions/handles su.register_add_shm_regions(inputs, outputs, shm_regions, precreated_shm_regions, shm_handles, input0_byte_size, input1_byte_size, output0_byte_size, output1_byte_size, use_system_shared_memory, use_cuda_shared_memory, triton_client) if batch_size == 1: expected0_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in output0_array.reshape((1, ) + tensor_shape) ] expected1_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in output1_array.reshape((1, ) + tensor_shape) ] else: expected0_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in output0_array.reshape(tensor_shape) ] expected1_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in output1_array.reshape(tensor_shape) ] # Force binary_data = False for shared memory and class output_req = [] i = 0 if "OUTPUT0" in outputs: if len(shm_regions) != 0: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput(OUTPUT0, binary_data=False)) else: output_req.append(grpcclient.InferRequestedOutput(OUTPUT0)) output_req[-1].set_shared_memory(shm_regions[2] + '_data', 
output0_byte_size) else: if output0_raw: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput( OUTPUT0, binary_data=config[3])) else: output_req.append( grpcclient.InferRequestedOutput(OUTPUT0)) else: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput( OUTPUT0, binary_data=False, class_count=num_classes)) else: output_req.append( grpcclient.InferRequestedOutput( OUTPUT0, class_count=num_classes)) i += 1 if "OUTPUT1" in outputs: if len(shm_regions) != 0: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput(OUTPUT1, binary_data=False)) else: output_req.append(grpcclient.InferRequestedOutput(OUTPUT1)) output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data', output1_byte_size) else: if output1_raw: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput( OUTPUT1, binary_data=config[3])) else: output_req.append( grpcclient.InferRequestedOutput(OUTPUT1)) else: if config[1] == "http": output_req.append( httpclient.InferRequestedOutput( OUTPUT1, binary_data=False, class_count=num_classes)) else: output_req.append( grpcclient.InferRequestedOutput( OUTPUT1, class_count=num_classes)) if config[2]: user_data = UserData() triton_client.start_stream(partial(completion_callback, user_data)) try: results = triton_client.async_stream_infer( model_name, inputs, model_version=model_version, outputs=output_req, request_id=str(_unique_request_id())) except Exception as e: triton_client.stop_stream() raise e triton_client.stop_stream() (results, error) = user_data._completed_requests.get() if error is not None: raise error else: results = triton_client.infer(model_name, inputs, model_version=model_version, outputs=output_req, request_id=str(_unique_request_id())) last_response = results.get_response() if not skip_request_id_check: global _seen_request_ids if config[1] == "http": request_id = int(last_response["id"]) else: request_id = int(last_response.id) tester.assertFalse(request_id in _seen_request_ids, "request_id: {}".format(request_id)) _seen_request_ids.add(request_id) if config[1] == "http": response_model_name = last_response["model_name"] if model_version != "": response_model_version = last_response["model_version"] response_outputs = last_response["outputs"] else: response_model_name = last_response.model_name if model_version != "": response_model_version = last_response.model_version response_outputs = last_response.outputs tester.assertEqual(response_model_name, model_name) if model_version != "": tester.assertEqual(str(response_model_version), model_version) tester.assertEqual(len(response_outputs), len(outputs)) for result in response_outputs: if config[1] == "http": result_name = result["name"] else: result_name = result.name if ((result_name == OUTPUT0 and output0_raw) or (result_name == OUTPUT1 and output1_raw)): if use_system_shared_memory or use_cuda_shared_memory: if result_name == OUTPUT0: shm_handle = shm_handles[2] else: shm_handle = shm_handles[3] output = results.get_output(result_name) if config[1] == "http": output_datatype = output['datatype'] output_shape = output['shape'] else: output_datatype = output.datatype output_shape = output.shape output_dtype = triton_to_np_dtype(output_datatype) if use_system_shared_memory: output_data = shm.get_contents_as_numpy( shm_handle, output_dtype, output_shape) elif use_cuda_shared_memory: output_data = cudashm.get_contents_as_numpy( shm_handle, output_dtype, output_shape) else: output_data = results.as_numpy(result_name) if (output_data.dtype == 
np.object) and (config[3] == False): output_data = output_data.astype(np.bytes_) if result_name == OUTPUT0: tester.assertTrue( np.array_equal(output_data, output0_array), "{}, {} expected: {}, got {}".format( model_name, OUTPUT0, output0_array, output_data)) elif result_name == OUTPUT1: tester.assertTrue( np.array_equal(output_data, output1_array), "{}, {} expected: {}, got {}".format( model_name, OUTPUT1, output1_array, output_data)) else: tester.assertTrue( False, "unexpected raw result {}".format(result_name)) else: for b in range(batch_size): # num_classes values must be returned and must # match expected top values if "nobatch" in pf: class_list = results.as_numpy(result_name) else: class_list = results.as_numpy(result_name)[b] tester.assertEqual(len(class_list), num_classes) if batch_size == 1: expected0_flatten = output0_array.flatten() expected1_flatten = output1_array.flatten() else: expected0_flatten = output0_array[b].flatten() expected1_flatten = output1_array[b].flatten() for idx, class_label in enumerate(class_list): # can't compare indices since could have different # indices with the same value/prob, so check that # the value of each index equals the expected value. # Only compare labels when the indices are equal. if type(class_label) == str: ctuple = class_label.split(':') else: ctuple = "".join(chr(x) for x in class_label).split(':') cval = float(ctuple[0]) cidx = int(ctuple[1]) if result_name == OUTPUT0: tester.assertEqual(cval, expected0_flatten[cidx]) tester.assertEqual( cval, expected0_flatten[expected0_sort_idx[b][idx]]) if cidx == expected0_sort_idx[b][idx]: tester.assertEqual( ctuple[2], 'label{}'.format( expected0_sort_idx[b][idx])) elif result_name == OUTPUT1: tester.assertEqual(cval, expected1_flatten[cidx]) tester.assertEqual( cval, expected1_flatten[expected1_sort_idx[b][idx]]) else: tester.assertTrue( False, "unexpected class result {}".format( result_name)) # Unregister system/cuda shared memory regions if they exist su.unregister_cleanup_shm_regions(shm_regions, shm_handles, precreated_shm_regions, outputs, use_system_shared_memory, use_cuda_shared_memory) return results
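# A hedged usage sketch (not part of the helper above): how a test typically
# calls infer_exact for the latest model version, mirroring the calls made by
# the version-status tests. The caller object and platform name below are
# illustrative assumptions only.
def _example_infer_exact_usage(tester, platform='graphdef'):
    # swap=True expects OUTPUT0 = INPUT0 - INPUT1 and OUTPUT1 = INPUT0 + INPUT1,
    # which is what the swapped model versions produce.
    return infer_exact(tester,
                       platform, (1, 16),
                       1,
                       np.int32,
                       np.int32,
                       np.int32,
                       model_version=None,
                       swap=True)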
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    # All three checks must pass before inferring; call the methods so the
    # error message reports their actual boolean values.
    if not (triton_client.is_server_live() and triton_client.is_server_ready()
            and triton_client.is_model_ready(model_name=FLAGS.model_name)):
        print("Error connecting to server: Server live {}. Server ready {}. "
              "Model ready {}".format(
                  triton_client.is_server_live(),
                  triton_client.is_server_ready(),
                  triton_client.is_model_ready(model_name=FLAGS.model_name)))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    input_data = [
        randint(0, 255, size=randint(100), dtype='uint8')
        for _ in range(randint(100) * FLAGS.batch_size)
    ]
    input_data = array_from_list(input_data)

    # Infer
    outputs = []
    input_name = "DALI_INPUT_0"
    output_name = "DALI_OUTPUT_0"
    input_shape = list(input_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    for batch in batcher(input_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [
            tritongrpcclient.InferInput(input_name, input_shape, "UINT8")
        ]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        if not math.isclose(np.mean(output0_data), np.mean(batch)):
            print("Pre/post average does not match")
            sys.exit(1)
        else:
            print("pass")

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
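# The batcher() generator used above is not shown in this excerpt. A minimal
# sketch, assuming it simply yields consecutive slices of at most batch_size
# samples along the first dimension (the real helper may differ):
def batcher_sketch(data, batch_size):
    """Yield consecutive batches of at most batch_size samples."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]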
def test_infer_stats_no_model_version(self):
    # Originally there were 3 versions of *_int32_int32_int32 and
    # version 3 was executed once. Version 2 and 3 models were
    # deleted from the model repository so now only expect version 1 to
    # be ready and version 3 to show stats but not be ready.
    for platform in ('graphdef', 'netdef'):
        model_name = platform + "_int32_int32_int32"
        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())

                model_metadata = triton_client.get_model_metadata(model_name)
                if pair[1] == "http":
                    self.assertEqual(model_name, model_metadata['name'])
                    self.assertEqual(len(model_metadata['versions']), 1)
                    self.assertEqual("1", model_metadata['versions'][0])
                else:
                    self.assertEqual(model_name, model_metadata.name)
                    self.assertEqual(len(model_metadata.versions), 1)
                    self.assertEqual("1", model_metadata.versions[0])

                # Only version 3 should have infer stats, only 1 is ready
                for v in (1, 2, 3):
                    if v == 1:
                        self.assertTrue(
                            triton_client.is_model_ready(
                                model_name, model_version=str(v)))
                    else:
                        self.assertFalse(
                            triton_client.is_model_ready(
                                model_name, model_version=str(v)))

                infer_stats = triton_client.get_inference_statistics(
                    model_name)
                if pair[1] == "http":
                    stats = infer_stats['model_stats']
                else:
                    stats = infer_stats.model_stats
                self.assertEqual(
                    len(stats), 3,
                    "expected 3 infer stats for model " + model_name)

                for s in stats:
                    if pair[1] == "http":
                        version = s['version']
                        stat = s['inference_stats']
                    else:
                        version = s.version
                        stat = s.inference_stats

                    if version != "3":
                        if pair[1] == "http":
                            self.assertEqual(
                                stat['success']['count'], 0,
                                "unexpected infer stats for version " +
                                version + " of model " + model_name)
                        else:
                            self.assertEqual(
                                stat.success.count, 0,
                                "unexpected infer stats for version " +
                                version + " of model " + model_name)
                    else:
                        # assertGreater actually checks the count; assertTrue
                        # with a second argument only treats it as a message.
                        if pair[1] == "http":
                            self.assertGreater(stat['success']['count'], 0)
                        else:
                            self.assertGreater(stat.success.count, 0)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def req_loop(self): client = grpcclient.InferenceServerClient(self._server_url) inputs = [ grpcclient.InferInput("INPUT0", self._shape, np_to_triton_dtype(self._dtype)) ] self._inflight_requests = 0 start_stat = client.get_inference_statistics( model_name=self._model_name) global _exit_signal while not _exit_signal: input_numpy = np.random.random_sample(self._shape).astype( self._dtype) inputs[0].set_data_from_numpy(input_numpy) self._input_data.append(input_numpy) with self._sync: def _check_can_send(): return self._inflight_requests < _inference_concurrency can_send = self._sync.wait_for(_check_can_send, timeout=_response_wait_time_s) self._tester.assertTrue( can_send, "client didn't receive a response within {}s".format( _response_wait_time_s)) callback = functools.partial(AsyncGrpcRunner._on_result, self) client.async_infer( model_name=self._model_name, inputs=inputs, request_id="{}".format(self._num_sent_request), callback=callback, ) self._inflight_requests += 1 self._num_sent_request += 1 if (self._num_sent_request == _inference_count): _exit_signal = True time.sleep(self._delay_ms / 1000.0) # wait till receive all requested data with self._sync: def _all_processed(): return self._inflight_requests == 0 self._processed_all = self._sync.wait_for(_all_processed, _finish_wait_time_s) self._tester.assertTrue( self._processed_all, "the processing didn't complete even after waiting for {}s". format(_finish_wait_time_s)) end_stat = client.get_inference_statistics(model_name=self._model_name) self._processed_request_count = end_stat.model_stats[ 0].inference_stats.success.count - start_stat.model_stats[ 0].inference_stats.success.count
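# AsyncGrpcRunner._on_result is referenced above but not shown in this
# excerpt. A minimal sketch of the bookkeeping req_loop relies on (decrement
# the in-flight counter under the condition variable and wake any waiter);
# how the response itself is recorded is an assumption:
def _on_result_sketch(self, result, error):
    with self._sync:
        if error is None:
            self._results.append(result)  # hypothetical storage attribute
        self._inflight_requests -= 1
        self._sync.notify_all()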
def check_status(model_name):
    client = grpcclient.InferenceServerClient("localhost:8001",
                                              verbose=FLAGS.verbose)
    stats = client.get_inference_statistics(model_name)
    print(stats)
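# A hedged sketch of pulling a per-version success count out of the
# statistics object printed above, following the field accesses used by the
# other gRPC-based tests in this file set
# (model_stats[i].inference_stats.success.count):
def get_success_count_sketch(stats, version="1"):
    for model_stat in stats.model_stats:
        if model_stat.version == version:
            return model_stat.inference_stats.success.count
    return None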
def main(_): """ Ask a question of context on Triton. :param context: str :param question: str :param question_id: int :return: """ os.environ[ "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # Get the Data if FLAGS.predict_file: eval_examples = read_squad_examples( input_file=FLAGS.predict_file, is_training=False, version_2_with_negative=FLAGS.version_2_with_negative) elif FLAGS.question and FLAGS.answer: input_data = [{ "paragraphs": [{ "context": FLAGS.context, "qas": [{ "id": 0, "question": FLAGS.question }] }] }] eval_examples = read_squad_examples( input_file=None, is_training=False, version_2_with_negative=FLAGS.version_2_with_negative, input_data=input_data) else: raise ValueError( "Either predict_file or question+answer need to defined") # Get Eval Features = Preprocessing eval_features = [] def append_feature(feature): eval_features.append(feature) convert_examples_to_features(examples=eval_examples[0:], tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature) protocol_str = 'grpc' # http or grpc url = FLAGS.triton_server_url verbose = False model_name = FLAGS.triton_model_name model_version = str(FLAGS.triton_model_version) batch_size = FLAGS.predict_batch_size triton_client = tritongrpcclient.InferenceServerClient(url, verbose) model_metadata = triton_client.get_model_metadata( model_name=model_name, model_version=model_version) model_config = triton_client.get_model_config(model_name=model_name, model_version=model_version) user_data = UserData() max_outstanding = 20 # Number of outstanding requests outstanding = 0 sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features)) recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features)) def process_outstanding(do_wait, outstanding): if (outstanding == 0 or do_wait is False): return outstanding # Wait for deferred items from callback functions (result, error, idx, start_time, inputs) = user_data._completed_requests.get() if (result is None): return outstanding stop = time.time() if (error is not None): raise ValueError( "Context returned null for async id marked as done") outstanding -= 1 time_list.append(stop - start_time) batch_count = len(inputs[label_id_key]) start_logits_results = result.as_numpy("start_logits") end_logits_results = result.as_numpy("end_logits") for i in range(batch_count): unique_id = int(inputs[label_id_key][i][0]) start_logits = [float(x) for x in start_logits_results[i].flat] end_logits = [float(x) for x in end_logits_results[i].flat] all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) recv_prog.update(n=batch_count) return outstanding all_results = [] time_list = [] print("Starting Sending Requests....\n") all_results_start = time.time() idx = 0 for inputs_dict in batch(eval_features, batch_size): present_batch_size = len(inputs_dict[label_id_key]) label_ids_data = np.stack(inputs_dict[label_id_key]) input_ids_data = np.stack(inputs_dict['input_ids']) input_mask_data = np.stack(inputs_dict['input_mask']) segment_ids_data = np.stack(inputs_dict['segment_ids']) inputs = [] inputs.append( tritongrpcclient.InferInput(label_id_key, label_ids_data.shape, "INT32")) inputs[0].set_data_from_numpy(label_ids_data) inputs.append( tritongrpcclient.InferInput('input_ids', 
input_ids_data.shape, "INT32")) inputs[1].set_data_from_numpy(input_ids_data) inputs.append( tritongrpcclient.InferInput('input_mask', input_mask_data.shape, "INT32")) inputs[2].set_data_from_numpy(input_mask_data) inputs.append( tritongrpcclient.InferInput('segment_ids', segment_ids_data.shape, "INT32")) inputs[3].set_data_from_numpy(segment_ids_data) outputs = [] outputs.append(tritongrpcclient.InferRequestedOutput('start_logits')) outputs.append(tritongrpcclient.InferRequestedOutput('end_logits')) start_time = time.time() triton_client.async_infer(model_name, inputs, partial(completion_callback, user_data, idx, start_time, inputs_dict), request_id=str(idx), model_version=model_version, outputs=outputs) outstanding += 1 idx += 1 sent_prog.update(n=present_batch_size) # Try to process at least one response per request outstanding = process_outstanding(outstanding >= max_outstanding, outstanding) tqdm.tqdm.write( "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format( outstanding)) # Now process all outstanding requests while (outstanding > 0): outstanding = process_outstanding(True, outstanding) all_results_end = time.time() all_results_total = (all_results_end - all_results_start) * 1000.0 print("-----------------------------") print("Total Time: {} ms".format(all_results_total)) print("-----------------------------") print("-----------------------------") print("Total Inference Time = %0.2f for" "Sentences processed = %d" % (sum(time_list), len(eval_features))) print("Throughput Average (sentences/sec) = %0.2f" % (len(eval_features) / all_results_total * 1000.0)) print("-----------------------------") if FLAGS.output_dir and FLAGS.predict_file: # When inferencing on a dataset, get inference statistics and write results to json file time_list.sort() avg = np.mean(time_list) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) print("-----------------------------") print("Summary Statistics") print("Batch size =", FLAGS.predict_batch_size) print("Sequence Length =", FLAGS.max_seq_length) print("Latency Confidence Level 95 (ms) =", cf_95 * 1000) print("Latency Confidence Level 99 (ms) =", cf_99 * 1000) print("Latency Confidence Level 100 (ms) =", cf_100 * 1000) print("Latency Average (ms) =", avg * 1000) print("-----------------------------") output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, FLAGS.version_2_with_negative, FLAGS.verbose_logging) else: # When inferencing on a single example, write best answer to stdout all_predictions, all_nbest_json, scores_diff_json = get_predictions( eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.do_lower_case, FLAGS.version_2_with_negative, FLAGS.verbose_logging) print( "Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" % (FLAGS.context, FLAGS.question, all_predictions[0]))
def main(): FLAGS = parse_args() try: triton_client = tritongrpcclient.InferenceServerClient( url=FLAGS.url, verbose=FLAGS.verbose) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) model_name = FLAGS.model_name model_version = -1 print("Loading images") image_data, labels = load_images( FLAGS.img_dir if FLAGS.img_dir is not None else FLAGS.img) image_data = array_from_list(image_data) print("Images loaded, inferring") # Infer inputs = [] outputs = [] input_name = "INPUT" output_name = "OUTPUT" input_shape = list(image_data.shape) input_shape[0] = FLAGS.batch_size inputs.append(tritongrpcclient.InferInput(input_name, input_shape, "UINT8")) outputs.append(tritongrpcclient.InferRequestedOutput(output_name)) img_idx = 0 for batch in batcher(image_data, FLAGS.batch_size): print("Input mean before backend processing:", np.mean(batch)) # Initialize the data inputs[0].set_data_from_numpy(batch) start = time.perf_counter() # Test with outputs results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs) # Get the output arrays from the results output0_data = results.as_numpy(output_name) end = time.perf_counter() - start print("latency: {:.6}ms".format(end * 1000)) print("Output mean after backend processing:", np.mean(output0_data)) print("Output shape: ", np.shape(output0_data)) maxs = np.argmax(output0_data, axis=1) for i in range(len(maxs)): print("Sample ", i, " - label: ", maxs[i], " ~ ", output0_data[i, maxs[i]]) if maxs[i] != labels[img_idx]: sys.exit(1) else: print("pass") img_idx += 1 statistics = triton_client.get_inference_statistics(model_name=model_name) if len(statistics.model_stats) != 1: print("FAILED: Inference Statistics") sys.exit(1)
def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, dtype): # Thread responsible for generating sequences of inference # requests. global _thread_exceptions print("Starting thread {} with seed {}".format(name, seed)) rng = np.random.RandomState(seed) client_metadata_list = [] try: # Must use streaming GRPC context to ensure each sequences' # requests are received in order. Create 2 common-use contexts # with different correlation IDs that are used for most # inference requests. Also create some rare-use contexts that # are used to make requests with rarely-used correlation IDs. # # Need to remember the last choice for each context since we # don't want some choices to follow others since that gives # results not expected. See below for details. common_cnt = 2 rare_cnt = 8 last_choices = [] for c in range(common_cnt + rare_cnt): client_metadata_list.append( (grpcclient.InferenceServerClient("localhost:8001", verbose=FLAGS.verbose), correlation_id_base + c)) last_choices.append(None) rare_idx = 0 for p in range(pass_cnt): # Common or rare context? if rng.rand() < 0.1: # Rare context... choice = rng.rand() client_idx = common_cnt + rare_idx # Send a no-end, valid-no-end or valid-valid # sequence... because it is a rare context this should # exercise the idle sequence path of the sequence # scheduler if choice < 0.33: sequence_no_end(client_metadata_list[client_idx], rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "no-end" elif choice < 0.66: sequence_valid_no_end(client_metadata_list[client_idx], rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "valid-no-end" else: sequence_valid_valid(client_metadata_list[client_idx], rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "valid-valid" rare_idx = (rare_idx + 1) % rare_cnt else: # Common context... client_idx = 0 if rng.rand() < 0.5 else 1 client_metadata = client_metadata_list[client_idx] last_choice = last_choices[client_idx] choice = rng.rand() # no-start cannot follow no-end since the server will # just assume that the no-start is a continuation of # the no-end sequence instead of being a sequence # missing start flag. 
if ((last_choice != "no-end") and (last_choice != "valid-no-end") and (choice < 0.01)): sequence_no_start(client_metadata, rng, trial, model_name, dtype, sequence_name=name) last_choices[client_idx] = "no-start" elif choice < 0.05: sequence_no_end(client_metadata, rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "no-end" elif choice < 0.10: sequence_valid_no_end(client_metadata, rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "valid-no-end" elif choice < 0.15: sequence_valid_valid(client_metadata, rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "valid-valid" else: sequence_valid(client_metadata, rng, trial, model_name, dtype, SEQUENCE_LENGTH_MEAN, SEQUENCE_LENGTH_STDEV, sequence_name=name) last_choices[client_idx] = "valid" except Exception as ex: _thread_exceptions_mutex.acquire() try: _thread_exceptions.append(traceback.format_exc()) finally: _thread_exceptions_mutex.release() # We need to explicitly close each client so that streams get # cleaned up and closed correctly, otherwise the application # can hang when exiting. for c, i in client_metadata_list: print("thread {} closing client {}".format(name, i)) c.close() print("Exiting thread {}".format(name))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--trial',
                        type=str,
                        required=True,
                        help='Set trial for the crashing client')
    FLAGS = parser.parse_args()

    trial = FLAGS.trial
    dtype = np.float32
    model_name = tu.get_zero_model_name(trial, 1, dtype)
    tensor_shape = (1,) if "nobatch" in trial else (1, 1)

    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                     verbose=True)

    shm = shared_memory.SharedMemory(create=True, size=8)
    count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
    count[0] = 0

    p = Process(target=crashing_client,
                name="crashing_client",
                args=(
                    model_name,
                    dtype,
                    tensor_shape,
                    shm.name,
                    triton_client,
                ))
def stress_thread(name, seed, test_duration, correlation_id_base, test_case_count, failed_test_case_count, sequence_request_count): # Thread responsible for generating sequences of inference # requests. global _thread_exceptions print("Starting thread {} with seed {}".format(name, seed)) rng = np.random.RandomState(seed) # FIXME revisit to check if it is necessary client_metadata_list = [] # Must use streaming GRPC context to ensure each sequences' # requests are received in order. Create 2 common-use contexts # with different correlation IDs that are used for most # inference requests. Also create some rare-use contexts that # are used to make requests with rarely-used correlation IDs. # # Need to remember if the last sequence case runs on each model # is no-end cases since we don't want some choices to follow others # since that gives results not expected. See below for details. common_cnt = 2 rare_cnt = 8 is_last_used_no_end = {} update_counter_fn = partial(update_test_count, test_case_count, failed_test_case_count, sequence_request_count) for c in range(common_cnt + rare_cnt): client_metadata_list.append( (grpcclient.InferenceServerClient("localhost:8001", verbose=FLAGS.verbose), correlation_id_base + c)) pa_start_seq_id = correlation_id_base + common_cnt + rare_cnt pa_end_seq_id = pa_start_seq_id + CORRELATION_ID_BLOCK_SIZE # Weight roughly in thousandth percent ss = ScenarioSelector([ (60, TimeoutScenario(name, get_trials(False), verbose=FLAGS.verbose)), (80, ResNetScenario(name, verbose=FLAGS.verbose)), (60, CrashingScenario(name, verbose=FLAGS.verbose)), (62, SequenceNoEndScenario(name, get_trials(), rng, is_last_used_no_end, verbose=FLAGS.verbose)), (68, SequenceValidNoEndScenario(name, get_trials(), rng, is_last_used_no_end, verbose=FLAGS.verbose)), (68, SequenceValidValidScenario(name, get_trials(), rng, is_last_used_no_end, verbose=FLAGS.verbose)), (7, SequenceNoStartScenario(name, get_trials(), rng, is_last_used_no_end, verbose=FLAGS.verbose)), (295, SequenceValidScenario(name, get_trials(), rng, is_last_used_no_end, verbose=FLAGS.verbose)), (300, PerfAnalyzerScenario( name, rng, get_trials(), get_trials(False), sequence_id_range=(pa_start_seq_id, pa_end_seq_id), verbose=FLAGS.verbose)), ], rng) rare_idx = 0 common_idx = 0 start_time = time.time() while time.time() - start_time < test_duration: scenario = ss.get_scenario() # FIXME generating 'is_rare' for now as some scenario uses it to select # client context, but we may not need this if we roll forward the sequence id if rng.rand() < 0.1: client_idx = common_cnt + rare_idx rare_idx = (rare_idx + 1) % rare_cnt else: client_idx = common_idx common_idx = (common_idx + 1) % common_cnt try: res = scenario.run(client_metadata_list[client_idx]) if res is not None: update_counter_fn(scenario.scenario_name(), count=res) except Exception as ex: update_counter_fn(scenario.scenario_name(), False) _thread_exceptions_mutex.acquire() try: _thread_exceptions.append(traceback.format_exc()) finally: _thread_exceptions_mutex.release() # We need to explicitly close each client so that streams get # cleaned up and closed correctly, otherwise the application # can hang when exiting. for c, i in client_metadata_list: print("thread {} closing client {}".format(name, i)) c.close() print("Exiting thread {}".format(name))
def test_nobatch_request_for_batching_model(self): input_size = 16 # graphdef_int32_int8_int8 has a batching version with max batch size of 8. # The server should return an error if the batch size is not included in the # input shapes. tensor_shape = (input_size, ) for protocol in ["http", "grpc"]: model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8) in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) inputs = [] outputs = [] if protocol == "http": triton_client = tritonhttpclient.InferenceServerClient( url='localhost:8000', verbose=True) inputs.append( tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32")) inputs.append( tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32")) outputs.append( tritonhttpclient.InferRequestedOutput('OUTPUT0')) outputs.append( tritonhttpclient.InferRequestedOutput('OUTPUT1')) else: triton_client = tritongrpcclient.InferenceServerClient( url='localhost:8001', verbose=True) inputs.append( tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32")) inputs.append( tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32")) outputs.append( tritongrpcclient.InferRequestedOutput('OUTPUT0')) outputs.append( tritongrpcclient.InferRequestedOutput('OUTPUT1')) # Initialize the data inputs[0].set_data_from_numpy(in0) inputs[1].set_data_from_numpy(in1) try: results = triton_client.infer(model_name, inputs, outputs=outputs) self.assertTrue( False, "expected failure with no batch request for batching model" ) except InferenceServerException as ex: pass
def test_model_specific_infer(self):
    input_size = 16
    tensor_shape = (1, input_size)

    # There are 3 versions of *_float32_float32_float32 but only
    # versions 1 and 3 should be available.
    for platform in ('graphdef', 'netdef', 'plan'):
        model_name = platform + "_float32_float32_float32"

        # Initially there should be no version status...
        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())
                self.assertTrue(
                    triton_client.is_model_ready(model_name,
                                                 model_version="1"))
                self.assertFalse(
                    triton_client.is_model_ready(model_name,
                                                 model_version="2"))
                self.assertTrue(
                    triton_client.is_model_ready(model_name,
                                                 model_version="3"))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Infer using version 1...
        iu.infer_exact(self,
                       platform,
                       tensor_shape,
                       1,
                       np.float32,
                       np.float32,
                       np.float32,
                       model_version=1,
                       swap=False)

        try:
            for pair in [("localhost:8000", "http"),
                         ("localhost:8001", "grpc")]:
                if pair[1] == "http":
                    triton_client = httpclient.InferenceServerClient(
                        url=pair[0], verbose=True)
                else:
                    triton_client = grpcclient.InferenceServerClient(
                        url=pair[0], verbose=True)

                self.assertTrue(triton_client.is_server_live())
                self.assertTrue(triton_client.is_server_ready())
                self.assertTrue(
                    triton_client.is_model_ready(model_name,
                                                 model_version="1"))
                self.assertFalse(
                    triton_client.is_model_ready(model_name,
                                                 model_version="2"))
                self.assertTrue(
                    triton_client.is_model_ready(model_name,
                                                 model_version="3"))

                # Only version 1 should have infer stats
                infer_stats = triton_client.get_inference_statistics(
                    model_name, model_version='1')
                if pair[1] == "http":
                    self.assertEqual(
                        len(infer_stats['model_stats']), 1,
                        "expected 1 infer stats for version 1"
                        " of model " + model_name)
                    stats = infer_stats['model_stats'][0]['inference_stats']
                    # assertGreater actually checks the count; assertTrue
                    # with a second argument only treats it as a message.
                    self.assertGreater(
                        stats['success']['count'], 0,
                        "expected infer stats for version 1"
                        " of model " + model_name)
                else:
                    self.assertEqual(
                        len(infer_stats.model_stats), 1,
                        "expected 1 infer stats for version 1"
                        " of model " + model_name)
                    stats = infer_stats.model_stats[0].inference_stats
                    self.assertGreater(
                        stats.success.count, 0,
                        "expected infer stats for version 1"
                        " of model " + model_name)

                infer_stats = triton_client.get_inference_statistics(
                    model_name, model_version='3')
                if pair[1] == "http":
                    stats = infer_stats['model_stats'][0]['inference_stats']
                    self.assertEqual(
                        stats['success']['count'], 0,
                        "unexpected infer stats for version 3"
                        " of model " + model_name)
                else:
                    stats = infer_stats.model_stats[0].inference_stats
                    self.assertEqual(
                        stats.success.count, 0,
                        "unexpected infer stats for version 3"
                        " of model " + model_name)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def _decoupled_infer(self, request_count, repeat_count=1, data_offset=100, delay_time=1000, wait_time=500): # Initialize data for IN input_data = np.arange(start=data_offset, stop=data_offset + repeat_count, dtype=np.int32) input_data = np.expand_dims(input_data, axis=0) self.inputs_[0].set_shape([1, repeat_count]) self.inputs_[0].set_data_from_numpy(input_data) # Initialize data for DELAY delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time self.inputs_[1].set_shape([1, repeat_count]) self.inputs_[1].set_data_from_numpy(delay_data) # Initialize data for WAIT wait_data = np.array([[wait_time]], dtype=np.uint32) self.inputs_[2].set_data_from_numpy(wait_data) user_data = UserData() result_dict = {} with grpcclient.InferenceServerClient(url="localhost:8001", verbose=True) as triton_client: # Establish stream triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): triton_client.async_stream_infer(model_name=self.model_name_, inputs=self.inputs_, request_id=str(i), outputs=self.outputs_) # Retrieve results... recv_count = 0 while recv_count < (repeat_count * request_count): data_item = user_data._completed_requests.get() if type(data_item) == InferenceServerException: raise data_item else: this_id = data_item.get_response().id if this_id not in result_dict.keys(): result_dict[this_id] = [] result_dict[this_id].append(data_item.as_numpy('OUT')) recv_count += 1 # Validate the results.. for i in range(request_count): this_id = str(i) if repeat_count != 0 and this_id not in result_dict.keys(): self.assertTrue( False, "response for request id {} not received".format(this_id)) elif repeat_count == 0 and this_id in result_dict.keys(): self.assertTrue( False, "received unexpected response for request id {}".format( this_id)) if repeat_count != 0: self.assertEqual(len(result_dict[this_id]), repeat_count) expected_data = data_offset result_list = result_dict[this_id] for j in range(len(result_list)): self.assertEqual(len(result_list[j]), 1) self.assertEqual(result_list[j][0], expected_data) expected_data += 1
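# The UserData/callback pair used by _decoupled_infer is not shown in this
# excerpt. A minimal sketch consistent with how the test consumes it: the
# stream callback receives (result, error) and pushes whichever is populated
# onto a queue that the main thread drains with _completed_requests.get().
# The "_sketch" suffix marks these as illustrative, not the actual helpers.
import queue


class UserDataSketch:

    def __init__(self):
        self._completed_requests = queue.Queue()


def callback_sketch(user_data, result, error):
    if error is not None:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)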
def test_model_versions_added(self): # Originally There was version 1 of *_float16_float32_float32. # Version 7 was added so now expect just version 7 to be ready. for platform in ('graphdef', ): model_name = platform + "_float16_float32_float32" try: for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( url=pair[0], verbose=True) else: triton_client = grpcclient.InferenceServerClient( url=pair[0], verbose=True) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) model_metadata = triton_client.get_model_metadata( model_name) if pair[1] == "http": self.assertEqual( model_name, model_metadata['name'], "expected status for model " + model_name) self.assertEqual( len(model_metadata['versions']), 1, "expected status for 1 versions for model " + model_name) self.assertEqual("7", model_metadata['versions'][0]) else: self.assertEqual( model_name, model_metadata.name, "expected status for model " + model_name) self.assertEqual( len(model_metadata.versions), 1, "expected status for 1 versions for model " + model_name) self.assertEqual("7", model_metadata.versions[0]) # Only version 7 should be ready. Neither should have infer stats for v in (1, 7): infer_stats = triton_client.get_inference_statistics( model_name, model_version=str(v)) if v == 7: self.assertTrue( triton_client.is_model_ready( model_name, model_version=str(v))) else: self.assertFalse( triton_client.is_model_ready( model_name, model_version=str(v))) if pair[1] == "http": stats = infer_stats['model_stats'][0][ 'inference_stats'] self.assertEqual( stats['success']['count'], 0, "unexpected infer stats for version " + str(v) + " of model " + model_name) else: stats = infer_stats.model_stats[0].inference_stats self.assertEqual( stats.success.count, 0, "unexpected infer stats for version " + str(v) + " of model " + model_name) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex))
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes, output_shapes, model_version=None, use_http=True, use_grpc=True, use_http_json_tensors=True, use_streaming=True, shm_region_name_prefix=None, use_system_shared_memory=False, use_cuda_shared_memory=False, priority=0, timeout_us=0): tester.assertTrue(use_http or use_grpc or use_http_json_tensors or use_streaming) configs = [] if use_http: configs.append(("localhost:8000", "http", False, True)) if use_http_json_tensors and (tensor_dtype != np.float16): configs.append(("localhost:8000", "http", False, False)) if use_grpc: configs.append(("localhost:8001", "grpc", False, False)) if use_streaming: configs.append(("localhost:8001", "grpc", True, False)) tester.assertEqual(len(input_shapes), len(output_shapes)) io_cnt = len(input_shapes) if shm_region_name_prefix is None: shm_region_name_prefix = ["input", "output"] input_dict = {} expected_dict = {} shm_ip_handles = list() shm_op_handles = list() for io_num in range(io_cnt): if pf == "libtorch" or pf == "libtorch_nobatch": input_name = "INPUT__{}".format(io_num) output_name = "OUTPUT__{}".format(io_num) else: input_name = "INPUT{}".format(io_num) output_name = "OUTPUT{}".format(io_num) input_shape = input_shapes[io_num] output_shape = output_shapes[io_num] rtensor_dtype = _range_repr_dtype(tensor_dtype) if (rtensor_dtype != np.bool): input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min, high=np.iinfo(rtensor_dtype).max, size=input_shape, dtype=rtensor_dtype) else: input_array = np.random.choice(a=[False, True], size=input_shape) if tensor_dtype != np.object: input_array = input_array.astype(tensor_dtype) expected_array = np.ndarray.copy(input_array) else: expected_array = np.array([ unicode(str(x), encoding='utf-8') for x in input_array.flatten() ], dtype=object) input_array = np.array([str(x) for x in input_array.flatten()], dtype=object).reshape(input_array.shape) expected_array = expected_array.reshape(output_shape) expected_dict[output_name] = expected_array output_byte_size = expected_array.nbytes if batch_size == 1: input_list = [input_array] else: input_list = [x for x in input_array] # Serialization of string tensors in the case of shared memory must be done manually if tensor_dtype == np.object: input_list_tmp = serialize_byte_tensor_list(input_list) else: input_list_tmp = input_list input_byte_size = sum([ip.nbytes for ip in input_list_tmp]) # create and register shared memory region for inputs and outputs shm_io_handles = su.create_set_either_shm_region( [ shm_region_name_prefix[0] + str(io_num), shm_region_name_prefix[1] + str(io_num) ], input_list_tmp, input_byte_size, output_byte_size, use_system_shared_memory, use_cuda_shared_memory) if len(shm_io_handles) != 0: shm_ip_handles.append(shm_io_handles[0]) shm_op_handles.append(shm_io_handles[1]) input_dict[input_name] = input_array if model_version is not None: model_version = str(model_version) else: model_version = "" # Run inference and check results for each config for config in configs: model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype) if config[1] == "http": triton_client = httpclient.InferenceServerClient(config[0], verbose=True) else: triton_client = grpcclient.InferenceServerClient(config[0], verbose=True) inputs = [] output_req = [] for io_num, (input_name, output_name) in enumerate( zip(input_dict.keys(), expected_dict.keys())): input_data = input_dict[input_name] input_byte_size = input_data.nbytes output_byte_size = expected_dict[output_name].nbytes if config[1] == "http": 
inputs.append( httpclient.InferInput(input_name, input_data.shape, np_to_triton_dtype(tensor_dtype))) output_req.append( httpclient.InferRequestedOutput(output_name, binary_data=config[3])) else: inputs.append( grpcclient.InferInput(input_name, input_data.shape, np_to_triton_dtype(tensor_dtype))) output_req.append(grpcclient.InferRequestedOutput(output_name)) if not (use_cuda_shared_memory or use_system_shared_memory): if config[1] == "http": inputs[-1].set_data_from_numpy(input_data, binary_data=config[3]) else: inputs[-1].set_data_from_numpy(input_data) else: # Register necessary shared memory regions/handles su.register_add_either_shm_regions( inputs, output_req, shm_region_name_prefix, (shm_ip_handles, shm_op_handles), io_num, input_byte_size, output_byte_size, use_system_shared_memory, use_cuda_shared_memory, triton_client) if config[2]: user_data = UserData() triton_client.start_stream(partial(completion_callback, user_data)) try: results = triton_client.async_stream_infer( model_name, inputs, model_version=model_version, outputs=output_req, request_id=str(_unique_request_id()), priority=priority, timeout=timeout_us) except Exception as e: triton_client.stop_stream() raise e triton_client.stop_stream() (results, error) = user_data._completed_requests.get() if error is not None: raise error else: results = triton_client.infer(model_name, inputs, model_version=model_version, outputs=output_req, request_id=str(_unique_request_id()), priority=priority, timeout=timeout_us) last_response = results.get_response() if config[1] == "http": response_model_name = last_response["model_name"] if model_version != "": response_model_version = last_response["model_version"] response_outputs = last_response["outputs"] else: response_model_name = last_response.model_name if model_version != "": response_model_version = last_response.model_version response_outputs = last_response.outputs tester.assertEqual(response_model_name, model_name) if model_version != "": tester.assertEqual(response_model_version, model_version) tester.assertEqual(len(response_outputs), io_cnt) for result in response_outputs: if config[1] == "http": result_name = result["name"] else: result_name = result.name tester.assertTrue(result_name in expected_dict) if use_system_shared_memory or use_cuda_shared_memory: if pf == "libtorch" or pf == "libtorch_nobatch": io_num = int(result_name.split("OUTPUT__")[1]) else: io_num = int(result_name.split("OUTPUT")[1]) shm_handle = shm_op_handles[io_num] output = results.get_output(result_name) if config[1] == "http": output_datatype = output['datatype'] output_shape = output['shape'] else: output_datatype = output.datatype output_shape = output.shape output_dtype = triton_to_np_dtype(output_datatype) if use_system_shared_memory: output_data = shm.get_contents_as_numpy( shm_handle, output_dtype, output_shape) elif use_cuda_shared_memory: output_data = cudashm.get_contents_as_numpy( shm_handle, output_dtype, output_shape) else: output_data = results.as_numpy(result_name) if (output_data.dtype == np.object) and (config[3] == False): output_data = output_data.astype(np.bytes_) expected = expected_dict[result_name] tester.assertEqual(output_data.shape, expected.shape) tester.assertTrue( np.array_equal(output_data, expected), "{}, {}, expected: {}, got {}".format(model_name, result_name, expected, output_data)) if len(shm_ip_handles) != 0: for io_num in range(io_cnt): if use_cuda_shared_memory: triton_client.unregister_cuda_shared_memory( shm_region_name_prefix[0] + str(io_num) + '_data') 
triton_client.unregister_cuda_shared_memory( shm_region_name_prefix[0] + str(io_num) + '_data') cudashm.destroy_shared_memory_region(shm_ip_handles[io_num]) cudashm.destroy_shared_memory_region(shm_op_handles[io_num]) else: triton_client.unregister_system_shared_memory( shm_region_name_prefix[1] + str(io_num) + '_data') triton_client.unregister_system_shared_memory( shm_region_name_prefix[1] + str(io_num) + '_data') shm.destroy_shared_memory_region(shm_ip_handles[io_num]) shm.destroy_shared_memory_region(shm_op_handles[io_num]) return results
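# A hedged usage sketch (not part of the helper above): exercising infer_zero
# over two identity I/O pairs. The caller, model flavor, and shapes are
# illustrative assumptions only.
def _example_infer_zero_usage(tester):
    return infer_zero(tester,
                      'graphdef',
                      1,
                      np.float32,
                      input_shapes=[(1, 16), (1, 16)],
                      output_shapes=[(1, 16), (1, 16)])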
def setUp(self):
    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
    self.clear_deferred_exceptions()
parser = argparse.ArgumentParser() parser.add_argument('--expected_dir', type=str, required=True, help='Directory containing expected output files') parser.add_argument('--model', type=str, required=True, help='Model name') FLAGS, unparsed = parser.parse_known_args() for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = FLAGS.model if pair[1] == "http": triton_client = httpclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) else: triton_client = grpcclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) nonmatch = list() expected_files = [ f for f in os.listdir(FLAGS.expected_dir) if (os.path.isfile(os.path.join(FLAGS.expected_dir, f)) and ( f.startswith("expected"))) ] for efile in expected_files: with open(os.path.join(FLAGS.expected_dir, efile)) as f: config = text_format.Parse(f.read(), mc.ModelConfig()) if pair[1] == "http": config_json = json.loads( json_format.MessageToJson(