def test_inference_client_generated_request_binary(self):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    input_data = np.array(self.input_data_, dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    inputs[1].set_data_from_numpy(input_data, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs)

    headers = {
        'Content-Type':
            'application/vnd.sagemaker-triton.binary+json;json-header-size={}'.
            format(header_length)
    }
    r = requests.post(self.url_, data=request_body, headers=headers)
    r.raise_for_status()

    self.assertEqual(
        self.expected_result_, r.json(),
        "Expected response body: {}; got: {}".format(self.expected_result_,
                                                     r.json()))

def test_inference_client_generated_response_binary(self):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    input_data = np.array(self.input_data_, dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=False)
    inputs[1].set_data_from_numpy(input_data, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    request_body, _ = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs)

    headers = {'Content-Type': 'application/json'}
    r = requests.post(self.url_, data=request_body, headers=headers)
    r.raise_for_status()

    header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
    header_length_str = r.headers['Content-Type'][len(header_length_prefix):]
    result = httpclient.InferenceServerClient.parse_response_body(
        r.content, header_length=int(header_length_str))

    output0_data = result.as_numpy('OUTPUT0')
    output1_data = result.as_numpy('OUTPUT1')
    for i in range(16):
        self.assertEqual(output0_data[0][i], self.expected_output0_data_[i])
        self.assertEqual(output1_data[0][i], self.expected_output1_data_[i])

def test_infer_no_outputs(model_name,
                          input0_data,
                          input1_data,
                          headers=None,
                          request_compression_algorithm=None,
                          response_compression_algorithm=None):
    inputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data, binary_data=False)
    inputs[1].set_data_from_numpy(input1_data, binary_data=True)

    query_params = {'test_1': 1, 'test_2': 2}
    results = triton_client.infer(
        model_name,
        inputs,
        outputs=None,
        query_params=query_params,
        headers=headers,
        request_compression_algorithm=request_compression_algorithm,
        response_compression_algorithm=response_compression_algorithm)

    return results

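# The module-level `triton_client` referenced by test_infer_no_outputs (and by
# run_infer, TestIdentityInference and test_infer below) is created elsewhere.
# A minimal sketch of that setup, assuming a Triton server on the default
# local HTTP port; the URL is illustrative:
import tritonclient.http as httpclient

triton_client = httpclient.InferenceServerClient(url="localhost:8000",
                                                 verbose=False)
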
def test_malformed_binary_header_large_number(self):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    input_data = np.array(self.input_data_, dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=True)
    inputs[1].set_data_from_numpy(input_data, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    request_body, _ = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs)

    headers = {
        'Content-Type':
            'application/vnd.sagemaker-triton.binary+json;json-header-size=12345'
    }
    r = requests.post(self.url_, data=request_body, headers=headers)
    self.assertEqual(
        400, r.status_code,
        "Expected error code {} returned for the request; got: {}".format(
            400, r.status_code))

def test_predict(self):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    input_data = np.array(self.input_data_, dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=False)
    inputs[1].set_data_from_numpy(input_data, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    request_body, _ = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs)

    headers = {'Content-Type': 'application/json'}
    r = requests.post(self.url_, data=request_body, headers=headers)
    r.raise_for_status()

    result = httpclient.InferenceServerClient.parse_response_body(r.content)

    output0_data = result.as_numpy('OUTPUT0')
    output1_data = result.as_numpy('OUTPUT1')
    for i in range(16):
        self.assertEqual(output0_data[0][i], self.expected_output0_data_[i])
        self.assertEqual(output1_data[0][i], self.expected_output1_data_[i])

def test_ensemble_io(self):
    model_name = "ensemble_io"
    with self._shm_leak_detector.Probe() as shm_probe:
        with httpclient.InferenceServerClient("localhost:8000") as client:
            input0 = np.random.random([1000]).astype(np.float32)
            for model_1_in_gpu in [True, False]:
                for model_2_in_gpu in [True, False]:
                    for model_3_in_gpu in [True, False]:
                        gpu_output = np.asarray(
                            [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu],
                            dtype=bool)
                        inputs = [
                            httpclient.InferInput(
                                "INPUT0", input0.shape,
                                np_to_triton_dtype(input0.dtype)),
                            httpclient.InferInput(
                                "GPU_OUTPUT", gpu_output.shape,
                                np_to_triton_dtype(gpu_output.dtype))
                        ]
                        inputs[0].set_data_from_numpy(input0)
                        inputs[1].set_data_from_numpy(gpu_output)
                        result = client.infer(model_name, inputs)
                        output0 = result.as_numpy('OUTPUT0')
                        self.assertIsNotNone(output0)
                        self.assertTrue(np.all(output0 == input0))

def test_no_update(self):
    # Test implicit state without updating any state
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], 'INT32'))
    inputs.append(tritonhttpclient.InferInput('TEST_CASE', [1], 'INT32'))
    inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.int32))
    inputs[1].set_data_from_numpy(np.asarray([1], dtype=np.int32))

    correlation_id = 3

    # Make sure the state is never updated.
    result_start = triton_client.infer(model_name="no_state_update",
                                       inputs=inputs,
                                       sequence_id=correlation_id,
                                       sequence_start=True)
    self.assertEqual(result_start.as_numpy('OUTPUT')[0], 1)

    for _ in range(10):
        result = triton_client.infer(model_name="no_state_update",
                                     inputs=inputs,
                                     sequence_id=correlation_id)
        self.assertEqual(result.as_numpy('OUTPUT')[0], 1)

    result_end = triton_client.infer(model_name="no_state_update",
                                     inputs=inputs,
                                     sequence_id=correlation_id,
                                     sequence_end=True)
    self.assertEqual(result_end.as_numpy('OUTPUT')[0], 1)

def predict(self, features: Dict) -> Dict:
    if not self.triton_client:
        self.triton_client = httpclient.InferenceServerClient(
            url=self.predictor_host, verbose=True)

    unique_ids = np.zeros([1, 1], dtype=np.int32)
    segment_ids = features["segment_ids"].reshape(1, 128)
    input_ids = features["input_ids"].reshape(1, 128)
    input_mask = features["input_mask"].reshape(1, 128)

    inputs = [
        httpclient.InferInput('unique_ids', [1, 1], "INT32"),
        httpclient.InferInput('segment_ids', [1, 128], "INT32"),
        httpclient.InferInput('input_ids', [1, 128], "INT32"),
        httpclient.InferInput('input_mask', [1, 128], "INT32")
    ]
    inputs[0].set_data_from_numpy(unique_ids)
    inputs[1].set_data_from_numpy(segment_ids)
    inputs[2].set_data_from_numpy(input_ids)
    inputs[3].set_data_from_numpy(input_mask)

    outputs = [
        httpclient.InferRequestedOutput('start_logits', binary_data=False),
        httpclient.InferRequestedOutput('end_logits', binary_data=False)
    ]
    result = self.triton_client.infer(self.model_name,
                                      inputs,
                                      outputs=outputs)
    return result.get_response()

def run_infer(model_name,
              model_version,
              numerical_features,
              categorical_features,
              headers=None):
    inputs = []
    outputs = []
    num_type = "FP16" if numerical_features.dtype == np.float16 else "FP32"
    inputs.append(
        http_client.InferInput('input__0', numerical_features.shape,
                               num_type))
    inputs.append(
        http_client.InferInput('input__1', categorical_features.shape,
                               "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(numerical_features, binary_data=True)
    inputs[1].set_data_from_numpy(categorical_features, binary_data=False)

    outputs.append(
        http_client.InferRequestedOutput('output__0', binary_data=True))

    results = triton_client.infer(
        model_name,
        inputs,
        model_version=str(model_version) if model_version != -1 else '',
        outputs=outputs,
        headers=headers)

    return results

def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    if isinstance(shm_ip1_handle, np.ndarray):
        inputs[1].set_data_from_numpy(input0_data, binary_data=True)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = utils.triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue(
            (output_data[0] == (input0_data + input1_data)).all(),
            "Model output does not match expected output")
    except Exception as ex:
        error_msg.append(str(ex))

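# _basic_inference assumes shared-memory regions named "input0_data",
# "input1_data", "output0_data" and "output1_data" were created and
# registered beforehand. A minimal sketch of that setup for system (CPU)
# shared memory; the helper name and shared-memory keys are illustrative:
import tritonclient.utils.shared_memory as shm

def register_system_shm_region(triton_client, name, data):
    byte_size = data.size * data.itemsize
    # Create the region, copy the tensor into it, then tell the server.
    handle = shm.create_shared_memory_region(name, "/" + name, byte_size)
    shm.set_shared_memory_region(handle, [data])
    triton_client.register_system_shared_memory(name, "/" + name, byte_size)
    return handle
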
def _optional_input_infer(self, model_name, has_input0, has_input1):
    with httpclient.InferenceServerClient("localhost:8000") as client:
        shape = (1,)
        if has_input0:
            input0_numpy = np.random.randint(0,
                                             100,
                                             size=shape,
                                             dtype=np.int32)
        else:
            # Set the input0 to a default value if it is optional. This is
            # the input used by the model if it is not provided.
            input0_numpy = np.array([5], dtype=np.int32)

        if has_input1:
            input1_numpy = np.random.randint(0,
                                             100,
                                             size=shape,
                                             dtype=np.int32)
        else:
            # Set the input1 to a default value if it is optional. This is
            # the input used by the model if it is not provided.
            input1_numpy = np.array([5], dtype=np.int32)

        inputs = []
        if has_input0:
            inputs.append(
                httpclient.InferInput("INPUT0", shape,
                                      np_to_triton_dtype(input0_numpy.dtype)))
            inputs[-1].set_data_from_numpy(input0_numpy)

        if has_input1:
            inputs.append(
                httpclient.InferInput("INPUT1", shape,
                                      np_to_triton_dtype(input1_numpy.dtype)))
            inputs[-1].set_data_from_numpy(input1_numpy)

        result = client.infer(model_name, inputs)

        output0 = result.as_numpy('OUTPUT0')
        self.assertIsNotNone(output0, "OUTPUT0 was not found.")
        output1 = result.as_numpy('OUTPUT1')
        self.assertIsNotNone(output1, "OUTPUT1 was not found.")

        expected_output0 = input0_numpy + input1_numpy
        expected_output1 = input0_numpy - input1_numpy
        np.testing.assert_equal(output0, expected_output0,
                                "OUTPUT0 doesn't match expected OUTPUT0")
        np.testing.assert_equal(output1, expected_output1,
                                "OUTPUT1 doesn't match expected OUTPUT1")

def test_ensemble(self):
    model_name = "ensemble"
    shape = [16]
    with self._shm_leak_detector.Probe() as shm_probe:
        with httpclient.InferenceServerClient("localhost:8000") as client:
            input_data_0 = np.random.random(shape).astype(np.float32)
            input_data_1 = np.random.random(shape).astype(np.float32)
            inputs = [
                httpclient.InferInput("INPUT0", input_data_0.shape,
                                      np_to_triton_dtype(input_data_0.dtype)),
                httpclient.InferInput("INPUT1", input_data_1.shape,
                                      np_to_triton_dtype(input_data_1.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data_0)
            inputs[1].set_data_from_numpy(input_data_1)
            result = client.infer(model_name, inputs)
            output0 = result.as_numpy('OUTPUT0')
            output1 = result.as_numpy('OUTPUT1')
            self.assertIsNotNone(output0)
            self.assertIsNotNone(output1)
            self.assertTrue(np.allclose(output0, 2 * input_data_0))
            self.assertTrue(np.allclose(output1, 2 * input_data_1))

    model_name = "ensemble_gpu"
    with self._shm_leak_detector.Probe() as shm_probe:
        with httpclient.InferenceServerClient("localhost:8000") as client:
            input_data_0 = np.random.random(shape).astype(np.float32)
            input_data_1 = np.random.random(shape).astype(np.float32)
            inputs = [
                httpclient.InferInput("INPUT0", input_data_0.shape,
                                      np_to_triton_dtype(input_data_0.dtype)),
                httpclient.InferInput("INPUT1", input_data_1.shape,
                                      np_to_triton_dtype(input_data_1.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data_0)
            inputs[1].set_data_from_numpy(input_data_1)
            result = client.infer(model_name, inputs)
            output0 = result.as_numpy('OUTPUT0')
            output1 = result.as_numpy('OUTPUT1')
            self.assertIsNotNone(output0)
            self.assertIsNotNone(output1)
            self.assertTrue(np.allclose(output0, 2 * input_data_0))
            self.assertTrue(np.allclose(output1, 2 * input_data_1))

def test_infer(self):
    try:
        triton_client = httpclient.InferenceServerClient(url="localhost:8000")
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = "libtorch_int32_int32_int32"

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT__0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT__1', [1, 16], "INT32"))

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all negative ones.
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input0_data = np.expand_dims(input0_data, axis=0)
    input1_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.int32)

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data, binary_data=True)
    inputs[1].set_data_from_numpy(input1_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True))

    results = triton_client.infer(model_name, inputs, outputs=outputs)

    output0_data = results.as_numpy('OUTPUT__0')
    output1_data = results.as_numpy('OUTPUT__1')

    # Validate the results by comparing with precomputed values.
    for i in range(16):
        print(
            str(input0_data[0][i]) + " - " + str(input1_data[0][i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[0][i]) + " + " + str(input1_data[0][i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[0][i] - input1_data[0][i]) != output0_data[0][i]:
            print("sync infer error: incorrect difference")
            sys.exit(1)
        if (input0_data[0][i] + input1_data[0][i]) != output1_data[0][i]:
            print("sync infer error: incorrect sum")
            sys.exit(1)

def predict(self, deployment_name, df):
    single_input_np = None
    if isinstance(df, np.ndarray):
        single_input_np = df

    inputs = []
    if single_input_np is not None:
        model_metadata = self.triton_client.get_model_metadata(
            deployment_name)
        raise MlflowException("Unnamed input is not currently supported")
    else:
        if isinstance(df, pd.DataFrame):
            model_metadata = self.triton_client.get_model_metadata(
                deployment_name)
            input_dtype = {}
            for input in model_metadata["inputs"]:
                input_dtype[input["name"]] = triton_to_np_dtype(
                    input["datatype"])
            # Sanity check
            if len(df.columns) != 1:
                raise MlflowException(
                    "Expected Pandas DataFrame to have only 1 column")
            col = df.columns[0]
            for row in df.index:
                val = df[col][row]
                # Need to form numpy array of the data type expected
                if type(df[col][row]) != np.ndarray:
                    val = np.array(val, dtype=input_dtype[row])
                inputs.append(
                    tritonhttpclient.InferInput(
                        row, val.shape, np_to_triton_dtype(val.dtype)))
                inputs[-1].set_data_from_numpy(val)
        else:
            for key, val in df.items():
                inputs.append(
                    tritonhttpclient.InferInput(
                        key, val.shape, np_to_triton_dtype(val.dtype)))
                inputs[-1].set_data_from_numpy(val)

    try:
        resp = self.triton_client.infer(model_name=deployment_name,
                                        inputs=inputs)
        res = {}
        for output in resp.get_response()['outputs']:
            res[output['name']] = resp.as_numpy(output['name'])
        return {"outputs": res}
    except InferenceServerException as ex:
        raise MlflowException(str(ex))

def requestGenerator(batched_image_data, input_name, output_name, dtype,
                     FLAGS):
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=True,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version

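# requestGenerator is a generator, so callers iterate over the requests it
# yields. A minimal usage sketch, assuming a `triton_client` matching
# FLAGS.protocol and a preprocessed `batched_image_data` exist; all names
# here are illustrative:
for inputs, outputs, model_name, model_version in requestGenerator(
        batched_image_data, input_name, output_name, dtype, FLAGS):
    response = triton_client.infer(model_name,
                                   inputs,
                                   model_version=model_version,
                                   outputs=outputs)
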
def test_infer_pymodel_error(self):
    model_name = "wrong_model"
    shape = [2, 2]
    with self._shm_leak_detector.Probe() as shm_probe:
        with httpclient.InferenceServerClient("localhost:8000") as client:
            input_data = (16384 * np.random.randn(*shape)).astype(np.uint32)
            inputs = [
                httpclient.InferInput("IN", input_data.shape,
                                      np_to_triton_dtype(input_data.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data)
            try:
                client.infer(model_name, inputs)
            except InferenceServerException as e:
                print(e.message())
                self.assertTrue(
                    e.message().startswith(
                        "Failed to process the request(s) for model instance"
                    ), "Exception message is not correct")
            else:
                self.fail("Wrong exception raised or did not raise an exception")

def run(self, output_names, input_feed, run_options=None):
    inputs = []
    for key, val in input_feed.items():
        val = np.expand_dims(val, axis=0)
        infer_input = tritonhttpclient.InferInput(key, val.shape,
                                                  self.dtype_mapping[key])
        infer_input.set_data_from_numpy(val)
        inputs.append(infer_input)

    outputs = []
    for output_name in output_names:
        outputs.append(tritonhttpclient.InferRequestedOutput(output_name))

    res = self.client.async_infer(self.model_name,
                                  inputs,
                                  request_id=str(self.request_count),
                                  outputs=outputs)
    res = res.get_result()

    results = []
    for output_name in output_names:
        results.append(res.as_numpy(output_name))
    return results

def TestIdentityInference(np_array, binary_data):
    model_name = "savedmodel_zero_1_object"
    inputs = []
    outputs = []

    inputs.append(httpclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array, binary_data=binary_data)
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=binary_data))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    if np_array.dtype == np.object_:
        if binary_data:
            if not np.array_equal(
                    np_array, np.char.decode(results.as_numpy('OUTPUT0'))):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
        else:
            if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            sys.exit(1)

def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            httpclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(httpclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1

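# A minimal usage sketch for sync_send, assuming a sequence-capable model is
# loaded; the client URL, model name, sequence id and values are illustrative:
client = httpclient.InferenceServerClient("localhost:8000")
result_list = []
sync_send(client,
          result_list,
          values=[11, 7, 5, 3],
          batch_size=1,
          sequence_id=1000,
          model_name="sequence_model",
          model_version="")
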
def generate_rest_request_from_dictionary(self, row_dict):
    triton_request_inputs = []
    for key, value in row_dict.items():
        t = clients.utils.get_type(value, self._default_float_type,
                                   self._default_int_type)
        if t == np.object_:
            value = clients.utils.map_multi_dimensional_list(
                value, lambda s: s.encode("utf-8"))
        numpy_value = np.array(value, dtype=t)
        triton_request_input = triton_httpclient.InferInput(
            key, list(numpy_value.shape), triton_utils.np_to_triton_dtype(t))
        triton_request_input.set_data_from_numpy(
            numpy_value, binary_data=True)  # binary_data=True by default
        triton_request_inputs.append(triton_request_input)

    # https://github.com/triton-inference-server/client/blob/530bcac5f1574aa2222930076200544eb274245c/src/python/library/tritonclient/http/__init__.py#L81
    # Returns a tuple of the request body and the JSON header length, which is
    # passed in the Inference-Header-Content-Length header.
    (request, json_size) = triton_httpclient._get_inference_request(
        inputs=triton_request_inputs,
        request_id="",
        outputs=None,
        sequence_id=0,
        sequence_start=0,
        sequence_end=0,
        priority=0,
        timeout=None)

    headers = {}
    if json_size:
        headers["Inference-Header-Content-Length"] = str(json_size)
    return (request, headers)

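# A minimal sketch of sending the generated body, assuming an instance
# `request_builder` of the enclosing class and a KServe-style Triton HTTP
# endpoint; the URL, model name and input dictionary are illustrative:
import requests

body, headers = request_builder.generate_rest_request_from_dictionary(
    {"INPUT0": [[1.0, 2.0]]})
resp = requests.post("http://localhost:8000/v2/models/my_model/infer",
                     data=body,
                     headers=headers)
resp.raise_for_status()
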
def test_incorrect_execute_return(self):
    model_name = 'execute_return_error'
    shape = [1, 1]
    with httpclient.InferenceServerClient("localhost:8000") as client:
        input_data = (5 * np.random.randn(*shape)).astype(np.float32)
        inputs = [
            httpclient.InferInput("INPUT", input_data.shape,
                                  np_to_triton_dtype(input_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data)

        # The first request to this model will return None.
        with self.assertRaises(InferenceServerException) as e:
            client.infer(model_name, inputs)
        self.assertTrue(
            str(e.exception).startswith(
                "Failed to process the request(s) for model instance "
                "'execute_return_error_0', message: Expected a list in the "
                "execute return"), "Exception message is not correct.")

        # The second inference request will return a list of None objects
        # instead of Python InferenceResponse objects.
        with self.assertRaises(InferenceServerException) as e:
            client.infer(model_name, inputs)
        self.assertTrue(
            str(e.exception).startswith(
                "Failed to process the request(s) for model instance "
                "'execute_return_error_0', message: Expected an "
                "'InferenceResponse' object in the execute function return"
                " list"), "Exception message is not correct.")

def test_wrong_implicit_state_name(self):
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], 'INT32'))
    inputs.append(tritonhttpclient.InferInput('TEST_CASE', [1], 'INT32'))
    inputs[0].set_data_from_numpy(
        np.random.randint(5, size=[1], dtype=np.int32))
    inputs[1].set_data_from_numpy(np.asarray([0], dtype=np.int32))

    with self.assertRaises(InferenceServerException) as e:
        triton_client.infer(model_name="wrong_internal_state",
                            inputs=inputs,
                            sequence_id=2,
                            sequence_start=True)
    self.assertEqual(str(e.exception),
                     "state 'undefined_state' is not a valid state name.")

def test_predict_specified_model(self):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    input_data = np.array(self.input_data_, dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    inputs[0].set_data_from_numpy(input_data, binary_data=False)
    inputs[1].set_data_from_numpy(input_data, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    request_body, _ = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs)

    headers = {
        'Content-Type': 'application/json',
        "X-Vertex-Ai-Triton-Redirect":
            "v2/models/{}/infer".format(self.model_)
    }
    r = requests.post(self.url_, data=request_body, headers=headers)
    r.raise_for_status()

    result = httpclient.InferenceServerClient.parse_response_body(r.content)

    output0_data = result.as_numpy('OUTPUT0')
    output1_data = result.as_numpy('OUTPUT1')
    if self.model_ == "addsub":
        expected_output0_data = [x * 2 for x in self.input_data_]
        expected_output1_data = [0 for x in self.input_data_]
    else:
        expected_output0_data = [0 for x in self.input_data_]
        expected_output1_data = [x * 2 for x in self.input_data_]
    for i in range(16):
        self.assertEqual(output0_data[0][i], expected_output0_data[i])
        self.assertEqual(output1_data[0][i], expected_output1_data[i])

def test_no_implicit_state(self):
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], 'INT32'))
    inputs.append(tritonhttpclient.InferInput('TEST_CASE', [1], 'INT32'))
    inputs[0].set_data_from_numpy(
        np.random.randint(5, size=[1], dtype=np.int32))
    inputs[1].set_data_from_numpy(np.asarray([0], dtype=np.int32))

    with self.assertRaises(InferenceServerException) as e:
        triton_client.infer(model_name="no_implicit_state",
                            inputs=inputs,
                            sequence_id=1,
                            sequence_start=True)
    self.assertEqual(
        str(e.exception),
        "unable to add state 'undefined_state'. State configuration is "
        "missing for model 'no_implicit_state'.")

def _infer_helper(self, model_name, shape, data_type):
    with httpclient.InferenceServerClient("localhost:8000") as client:
        input_data_0 = np.array(np.random.randn(*shape), dtype=data_type)
        inputs = [
            httpclient.InferInput("INPUT0", shape,
                                  np_to_triton_dtype(input_data_0.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data_0)
        result = client.infer(model_name, inputs)
        output0 = result.as_numpy('OUTPUT0')
        self.assertTrue(np.all(input_data_0 == output0))

def test_http(self):
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    try:
        triton_client.infer(model_name="query", inputs=inputs)
        self.fail("expect error with query information")
    except InferenceServerException as ex:
        self.assertIn("OUTPUT0 CPU 0", ex.message())
        self.assertIn("OUTPUT1 CPU 0", ex.message())

def send_identity_request(self, client, model_name):
    inputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
    input0_data = np.arange(start=0, stop=16, dtype=np.float32)
    input0_data = np.expand_dims(input0_data, axis=0)
    inputs[0].set_data_from_numpy(input0_data)

    result = client.infer(
        model_name=model_name,
        inputs=inputs,
        outputs=[httpclient.InferRequestedOutput('OUTPUT0')])
    output_numpy = result.as_numpy('OUTPUT0')
    self.assertTrue(np.all(input0_data == output_numpy))

def test_infer(model_name, input0_data, input1_data, headers=None):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data, binary_data=False)
    inputs[1].set_data_from_numpy(input1_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    query_params = {'test_1': 1, 'test_2': 2}
    results = triton_client.infer(model_name,
                                  inputs,
                                  outputs=outputs,
                                  query_params=query_params,
                                  headers=headers)

    return results

def oneflow_infer(data):
    triton_client = httpclient.InferenceServerClient(url='127.0.0.1:8000')
    inputs = []
    inputs.append(httpclient.InferInput('INPUT_0', data.shape, "INT64"))
    inputs[0].set_data_from_numpy(data, binary_data=True)

    outputs = []
    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT_0',
                                        binary_data=True,
                                        class_count=1))

    results = triton_client.infer("embedding", inputs=inputs, outputs=outputs)
    output_data = results.as_numpy('OUTPUT_0')
    return output_data

def test_bool(self):
    model_name = 'identity_bool'
    with httpclient.InferenceServerClient("localhost:8000") as client:
        input_data = np.array([[True, False, True]], dtype=bool)
        inputs = [
            httpclient.InferInput("INPUT0", input_data.shape,
                                  np_to_triton_dtype(input_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data)
        result = client.infer(model_name, inputs)
        output0 = result.as_numpy('OUTPUT0')
        self.assertIsNotNone(output0)
        self.assertTrue(np.all(output0 == input_data))