def recognize(self, wav_file, idx=0):
    waveform, sample_rate = sf.read(wav_file)
    samples = np.array([waveform], dtype=np.float32)
    lengths = np.array([[len(waveform)]], dtype=np.int32)
    # Better: pad the waveform to the nearest whole second here, e.g.
    # target_seconds = math.ceil(len(waveform) / sample_rate)
    # target_samples = np.zeros([1, target_seconds * sample_rate])
    # target_samples[0][0:len(waveform)] = waveform
    # samples = target_samples
    sequence_id = 10086 + idx
    inputs = [
        self.protocol_client.InferInput("WAV", samples.shape,
                                        np_to_triton_dtype(samples.dtype)),
        self.protocol_client.InferInput("WAV_LENS", lengths.shape,
                                        np_to_triton_dtype(lengths.dtype))
    ]
    inputs[0].set_data_from_numpy(samples)
    inputs[1].set_data_from_numpy(lengths)
    outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")]
    response = self.triton_client.infer(self.model_name,
                                        inputs,
                                        request_id=str(sequence_id),
                                        outputs=outputs)
    result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
    return [result]
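# The commented-out block above pads the waveform up to a whole number of
# seconds before sending it. A minimal sketch of that idea (the padding
# policy is taken from the comment; the helper name is ours):
import math

import numpy as np


def pad_to_whole_seconds(waveform, sample_rate):
    target_seconds = math.ceil(len(waveform) / sample_rate)
    padded = np.zeros((1, target_seconds * sample_rate), dtype=np.float32)
    padded[0, :len(waveform)] = waveform
    return padded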
def recognize(self, wav_file, idx=0):
    waveform, sample_rate = sf.read(wav_file)

    # Cut the waveform into a first chunk followed by fixed-size chunks.
    wav_segs = []
    i = 0
    while i < len(waveform):
        if i == 0:
            stride = int(self.first_chunk_in_secs * sample_rate)
        else:
            stride = int(self.other_chunk_in_secs * sample_rate)
        wav_segs.append(waveform[i:i + stride])
        i += len(wav_segs[-1])

    sequence_id = idx + 10086

    # Simulate streaming: send each chunk as part of one Triton sequence.
    for seg_idx, seg in enumerate(wav_segs):
        chunk_len = len(seg)
        if seg_idx == 0:
            chunk_samples = int(self.first_chunk_in_secs * sample_rate)
        else:
            chunk_samples = int(self.other_chunk_in_secs * sample_rate)
        expect_input = np.zeros((1, chunk_samples), dtype=np.float32)
        expect_input[0][0:chunk_len] = seg

        input0_data = expect_input
        input1_data = np.array([[chunk_len]], dtype=np.int32)
        inputs = [
            self.protocol_client.InferInput(
                "WAV", input0_data.shape,
                np_to_triton_dtype(input0_data.dtype)),
            self.protocol_client.InferInput(
                "WAV_LENS", input1_data.shape,
                np_to_triton_dtype(input1_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input0_data)
        inputs[1].set_data_from_numpy(input1_data)
        outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")]

        end = seg_idx == len(wav_segs) - 1
        response = self.triton_client.infer(self.model_name,
                                            inputs,
                                            outputs=outputs,
                                            sequence_id=sequence_id,
                                            sequence_start=seg_idx == 0,
                                            sequence_end=end)
        result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
        print("Get response from {}th chunk: {}".format(seg_idx + 1, result))
    return [result]
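# Helper sketch: the chunking policy used by the streaming recognize() above,
# factored out so it can be tested on its own (the standalone name and
# signature are ours, not part of the original client):
def split_waveform(waveform, sample_rate, first_chunk_in_secs,
                   other_chunk_in_secs):
    segs, i = [], 0
    while i < len(waveform):
        secs = first_chunk_in_secs if i == 0 else other_chunk_in_secs
        segs.append(waveform[i:i + int(secs * sample_rate)])
        i += len(segs[-1])
    return segs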
def predict(self, deployment_name, df):
    single_input_np = None
    if isinstance(df, np.ndarray):
        single_input_np = df

    inputs = []
    if single_input_np is not None:
        raise MlflowException("Unnamed input is not currently supported")
    else:
        if isinstance(df, pd.DataFrame):
            model_metadata = self.triton_client.get_model_metadata(
                deployment_name)
            input_dtype = {}
            for input in model_metadata["inputs"]:
                input_dtype[input["name"]] = triton_to_np_dtype(
                    input["datatype"])
            # Sanity check
            if len(df.columns) != 1:
                raise MlflowException(
                    "Expect Pandas DataFrame has only 1 column")
            col = df.columns[0]
            for row in df.index:
                val = df[col][row]
                # The row label is the Triton input name; coerce the value
                # into a numpy array of the dtype the model expects.
                if type(df[col][row]) != np.ndarray:
                    val = np.array(val, dtype=input_dtype[row])
                inputs.append(
                    tritonhttpclient.InferInput(
                        row, val.shape, np_to_triton_dtype(val.dtype)))
                inputs[-1].set_data_from_numpy(val)
        else:
            for key, val in df:
                inputs.append(
                    tritonhttpclient.InferInput(
                        key, val.shape, np_to_triton_dtype(val.dtype)))
                inputs[-1].set_data_from_numpy(val)

    try:
        resp = self.triton_client.infer(model_name=deployment_name,
                                        inputs=inputs)
        res = {}
        for output in resp.get_response()['outputs']:
            res[output['name']] = resp.as_numpy(output['name'])
        return {"outputs": res}
    except InferenceServerException as ex:
        raise MlflowException(str(ex))
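# Sketch of the single-column DataFrame layout predict() expects: the row
# index carries the Triton input names and each cell holds that input's
# value (the input name "INPUT0" and the tensor shape are assumptions):
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"data": [np.zeros((1, 3, 224, 224), dtype=np.float32)]},
    index=["INPUT0"],  # row labels double as input tensor names
)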
def _convert_column_to_triton_input(col,
                                    name,
                                    input_class=grpcclient.InferInput):
    col = col.reshape(len(col), 1)
    input_tensor = input_class(name, col.shape, np_to_triton_dtype(col.dtype))
    input_tensor.set_data_from_numpy(col)
    return input_tensor
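# Usage sketch for the helper above (assumes it is in scope; the column name
# "user_id" is a placeholder):
import numpy as np

col = np.arange(4, dtype=np.int64)
tensor = _convert_column_to_triton_input(col, "user_id")
# 'tensor' now carries (4, 1)-shaped INT64 data, ready to pass to
# client.infer(model_name, [tensor], ...)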
def run(self, client_metadata):
    trial = self.get_trial()
    model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
    triton_client = client_metadata[0]
    input_name = self.input_name_
    if "libtorch" in trial:
        input_name = "INPUT__0"

    # Build a ~1 GiB input tensor so the request cannot finish quickly.
    tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                               np.dtype(self.input_dtype_).itemsize), )
    in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(self.input_dtype_)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Expect an exception for small timeout values.
    try:
        triton_client.infer(model_name, inputs, client_timeout=0.1)
        assert False, "expected inference failure from deadline exceeded"
    except Exception as ex:
        if "Deadline Exceeded" not in ex.message():
            assert False, "timeout_client failed {}".format(self.name_)

    # Expect timeout error as success case
    return 1
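# Minimal standalone sketch of the deadline pattern exercised above: a tiny
# client_timeout forces a "Deadline Exceeded" error from the gRPC client
# (the endpoint, model, and input names are assumptions):
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException, np_to_triton_dtype

client = grpcclient.InferenceServerClient("localhost:8001")
in0 = np.zeros((1, 16), dtype=np.float32)
inputs = [
    grpcclient.InferInput("INPUT0", in0.shape, np_to_triton_dtype(in0.dtype))
]
inputs[0].set_data_from_numpy(in0)
try:
    client.infer("my_model", inputs, client_timeout=0.1)
except InferenceServerException as ex:
    print("request timed out as expected:", ex.message())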
def generate_rest_request_from_dictionary(self, row_dict):
    triton_request_inputs = []
    for key, value in row_dict.items():
        t = clients.utils.get_type(value, self._default_float_type,
                                   self._default_int_type)
        if t == np.object_:
            value = clients.utils.map_multi_dimensional_list(
                value, lambda s: s.encode("utf-8"))
        numpy_value = np.array(value, dtype=t)
        triton_request_input = triton_httpclient.InferInput(
            key, list(numpy_value.shape), triton_utils.np_to_triton_dtype(t))
        triton_request_input.set_data_from_numpy(
            numpy_value, binary_data=True)  # binary_data=True by default
        triton_request_inputs.append(triton_request_input)

    # https://github.com/triton-inference-server/client/blob/530bcac5f1574aa2222930076200544eb274245c/src/python/library/tritonclient/http/__init__.py#L81
    # Returns a (request, request_len) tuple; the length goes into the
    # Inference-Header-Content-Length header.
    (request, json_size) = triton_httpclient._get_inference_request(
        inputs=triton_request_inputs,
        request_id="",
        outputs=None,
        sequence_id=0,
        sequence_start=0,
        sequence_end=0,
        priority=0,
        timeout=None)

    headers = {}
    if json_size:
        headers["Inference-Header-Content-Length"] = str(json_size)
    return (request, headers)
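# Sketch of sending the serialized body produced above to the standard
# KServe v2 infer endpoint over raw HTTP. 'gen' stands in for an instance of
# the class defining the method above; the URL, model name, input payload,
# and use of the 'requests' library are all assumptions:
import requests

body, headers = gen.generate_rest_request_from_dictionary(
    {"INPUT0": [[1.0, 2.0, 3.0]]})
resp = requests.post("http://localhost:8000/v2/models/my_model/infer",
                     data=body,
                     headers=headers)
print(resp.status_code)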
def convert_df_to_triton_input(column_names,
                               batch,
                               input_class=httpclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = [
        input_class(name, col.shape, np_to_triton_dtype(col.dtype))
        for name, col in columns
    ]
    for i, (name, col) in enumerate(columns):
        inputs[i].set_data_from_numpy(col.values_host)
    return inputs
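# Usage sketch with a small cuDF batch (the column names are placeholders;
# assumes a GPU and cuDF are available, since values_host copies device data
# back to the host):
import cudf

batch = cudf.DataFrame({"userId": [1, 2, 3], "movieId": [10, 20, 30]})
inputs = convert_df_to_triton_input(["userId", "movieId"], batch)
# Each InferInput now wraps the host copy of one column.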
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):
    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]
    time_start = dt.datetime.now()
    response = client.infer(model_name, inputs, request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"
    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
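# Example invocation of _run_query (every path, name, and threshold here is
# a placeholder; assumes a Triton server with the model loaded):
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")
diff, run_time = _run_query(
    client,
    n_rows=64,
    model_name="movielens_tf",
    workflow_path="/models/workflow",
    data_path="/data/test.csv",
    actual_output_filename="/data/expected.csv",
    output_name="output",
)
print("max abs diff:", diff.max(), "latency:", run_time)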
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]

    # Read in a batch of data to get transforms for.
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # Convert the batch to Triton inputs.
    columns = [(col, batch[col]) for col in col_names]
    inputs = []
    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # Placeholder variables for the output.
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))

    # Make the request.
    with httpclient.InferenceServerClient("localhost:8001") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)
    assert (diff < err_tol).all()
def test_content_encoding_unsupported_client(self):
    for encoding in ["chunked", "compress", "deflate", "gzip"]:
        with self.subTest(encoding=encoding):
            headers = {"Transfer-Encoding": encoding}
            np_input = np.arange(8, dtype=np.float32).reshape(1, -1)
            model = "onnx_zero_1_float32"

            # Setup inputs
            inputs = []
            inputs.append(
                tritonhttpclient.InferInput(
                    'INPUT0', np_input.shape,
                    np_to_triton_dtype(np_input.dtype)))
            inputs[0].set_data_from_numpy(np_input)

            with tritonhttpclient.InferenceServerClient(
                    "localhost:8000") as client:
                # Python client is expected to raise an exception to reject
                # 'content-encoding' HTTP headers.
                with self.assertRaisesRegex(InferenceServerException,
                                            "Unsupported HTTP header"):
                    client.infer(model_name=model,
                                 inputs=inputs,
                                 headers=headers)
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that
    # the inference will not have completed when being terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1, ), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
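# Parent-side sketch: create the shared counter that crashing_client bumps
# on every loop iteration, then hand its name to a child process (process
# management is elided; the size matches the np.int32 counter above):
import numpy as np
from multiprocessing import shared_memory

shm = shared_memory.SharedMemory(create=True,
                                 size=np.dtype(np.int32).itemsize)
count = np.ndarray((1, ), dtype=np.int32, buffer=shm.buf)
count[0] = 0
# ... spawn crashing_client(..., shm_name=shm.name, ...) in a child process,
# poll count[0] to confirm requests are in flight, then terminate the child.
shm.close()
shm.unlink()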
def recognize(self, audio_signal, filenames):
    # Send requests of FLAGS.batch_size audio signals. If the number of
    # audio signals isn't an exact multiple of FLAGS.batch_size then just
    # start over with the first audio until the batch is filled.
    input_batch = []
    input_filenames = []
    max_num_samples_batch = 0

    for idx in range(self.batch_size):
        input_batch.append(audio_signal[idx].astype(self.audio_signals_type))
        input_filenames.append(filenames[idx])
        num_samples = audio_signal[idx].shape[0]
        if num_samples > max_num_samples_batch:
            max_num_samples_batch = num_samples

    # Pad every signal in the batch to the longest one with Gaussian noise
    # that matches the signal's mean and standard deviation.
    for idx in range(self.batch_size):
        num_samples = input_batch[idx].shape[0]
        mean = np.mean(input_batch[idx])
        std_var = np.std(input_batch[idx])
        gauss_noise = np.random.normal(mean, std_var,
                                       max_num_samples_batch - num_samples)
        input_batch[idx] = np.concatenate(
            (input_batch[idx], gauss_noise.astype(self.audio_signals_type)))

    max_num_samples_batch = np.asarray([max_num_samples_batch],
                                       dtype=self.num_samples_type)
    num_samples_batch = [max_num_samples_batch] * self.batch_size

    # Send request
    print("Sending request to transcribe file(s):", ",".join(input_filenames))

    inputs = []
    input_batch = np.asarray(input_batch)
    num_samples_batch = np.asarray(num_samples_batch)
    inputs.append(
        self.prtcl_client.InferInput(
            self.audio_signals_name, input_batch.shape,
            np_to_triton_dtype(input_batch.dtype)))
    inputs.append(
        self.prtcl_client.InferInput(
            self.num_samples_name, num_samples_batch.shape,
            np_to_triton_dtype(num_samples_batch.dtype)))

    if self.prtcl_client is tritonclient.grpc:
        inputs[0].set_data_from_numpy(input_batch)
        inputs[1].set_data_from_numpy(num_samples_batch)
    else:  # http
        inputs[0].set_data_from_numpy(input_batch, binary_data=True)
        inputs[1].set_data_from_numpy(num_samples_batch, binary_data=True)

    outputs = []
    if self.prtcl_client is tritonclient.grpc:
        outputs.append(
            self.prtcl_client.InferRequestedOutput(self.transcripts_name))
    else:
        outputs.append(
            self.prtcl_client.InferRequestedOutput(self.transcripts_name,
                                                   binary_data=True))

    triton_result = self.triton_client.infer(self.model_name,
                                             inputs=inputs,
                                             outputs=outputs)
    transcripts = triton_result.as_numpy(self.transcripts_name)
    result = self.postprocess(transcripts, input_filenames)
    return result
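# Sketch of the protocol wiring the identity checks above rely on:
# self.prtcl_client is the tritonclient protocol *module* itself, so an
# "is tritonclient.grpc" test distinguishes gRPC from HTTP behavior
# (the server URLs here are assumptions):
import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient

use_grpc = True
prtcl_client = grpcclient if use_grpc else httpclient
triton_client = (grpcclient.InferenceServerClient("localhost:8001")
                 if use_grpc else
                 httpclient.InferenceServerClient("localhost:8000"))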
# Run the libtorch_visionop model, which depends on a torchvision custom
# operation.
model_name = FLAGS.model

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

# Create the data for the input tensors.
input_data = np.random.rand(1, 3, 10, 10).astype(np.float32)
box_data = np.array([[1, 1, 2, 3, 4]]).astype(np.float32)

inputs = []
inputs.append(
    client_util.InferInput("INPUT__0", input_data.shape,
                           np_to_triton_dtype(input_data.dtype)))
inputs[0].set_data_from_numpy(input_data)
inputs.append(
    client_util.InferInput("INPUT__1", box_data.shape,
                           np_to_triton_dtype(box_data.dtype)))
inputs[1].set_data_from_numpy(box_data)

results = client.infer(model_name, inputs)

# We expect 1 result of shape [1, 3, 5, 5].
output_data = results.as_numpy('OUTPUT__0')
if output_data is None:
    print("error: expected 'OUTPUT__0'")
    sys.exit(1)
if output_data.shape != (1, 3, 5, 5):
    print("error: expected output shape (1, 3, 5, 5), got {}".format(
        output_data.shape))
    sys.exit(1)
def check_sequence_async(self,
                         client_metadata,
                         trial,
                         model_name,
                         input_dtype,
                         steps,
                         timeout_ms=DEFAULT_TIMEOUT_MS,
                         batch_size=1,
                         sequence_name="<unknown>",
                         tensor_shape=(1, ),
                         input_name="INPUT",
                         output_name="OUTPUT"):
    """Perform sequence of inferences using async run.

    The 'steps' holds a list of tuples, one for each inference with format:

        (flag_str, value, expected_result, delay_ms)
    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
        assert False, "unknown trial type: " + trial

    if "nobatch" not in trial:
        tensor_shape = (batch_size, ) + tensor_shape
    if "libtorch" in trial:
        input_name = "INPUT__0"
        output_name = "OUTPUT__0"

    triton_client = client_metadata[0]
    sequence_id = client_metadata[1]

    # Execute the sequence of inference...
    seq_start_ms = int(round(time.time() * 1000))
    user_data = SequenceScenario.UserData()

    # Ensure there is no running stream
    triton_client.stop_stream()
    triton_client.start_stream(partial(completion_callback, user_data))

    sent_count = 0
    for flag_str, value, _, delay_ms in steps:
        seq_start = False
        seq_end = False
        if flag_str is not None:
            seq_start = ("start" in flag_str)
            seq_end = ("end" in flag_str)

        if input_dtype == np.object_:
            in0 = np.full(tensor_shape, value, dtype=np.int32)
            in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
            in0 = in0n.reshape(tensor_shape)
        else:
            in0 = np.full(tensor_shape, value, dtype=input_dtype)

        inputs = [
            grpcclient.InferInput(input_name, tensor_shape,
                                  np_to_triton_dtype(input_dtype)),
        ]
        inputs[0].set_data_from_numpy(in0)

        triton_client.async_stream_infer(model_name,
                                         inputs,
                                         sequence_id=sequence_id,
                                         sequence_start=seq_start,
                                         sequence_end=seq_end)
        sent_count += 1

        if delay_ms is not None:
            time.sleep(delay_ms / 1000.0)

    # Process the results in the order that they were sent
    result = None
    processed_count = 0
    while processed_count < sent_count:
        (results, error) = user_data._completed_requests.get()
        if error is not None:
            raise error

        (_, value, expected, _) = steps[processed_count]
        processed_count += 1
        if timeout_ms is not None:
            now_ms = int(round(time.time() * 1000))
            if (now_ms - seq_start_ms) > timeout_ms:
                raise TimeoutException(
                    "Timeout expired for {}, got {} ms".format(
                        sequence_name, (now_ms - seq_start_ms)))

        result = results.as_numpy(
            output_name)[0] if "nobatch" in trial else results.as_numpy(
                output_name)[0][0]
        if self.verbose_:
            print("{} {}: + {} = {}".format(sequence_name, sequence_id,
                                            value, result),
                  file=self.out_stream_)

        if expected is not None:
            if input_dtype == np.object_:
                assert int(result) == expected, (
                    "{}: expected result {}, got {} {} {}".format(
                        sequence_name, expected, int(result), trial,
                        model_name))
            else:
                assert result == expected, (
                    "{}: expected result {}, got {} {} {}".format(
                        sequence_name, expected, result, trial, model_name))

    triton_client.stop_stream()
    return sent_count
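# Minimal standalone sketch of the async streaming pattern above: a
# queue-backed callback collects results from async_stream_infer (the
# endpoint, model, sequence id, and tensor names are assumptions):
import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

completed = queue.Queue()


def callback(q, result, error):
    q.put((result, error))


client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(partial(callback, completed))

in0 = np.full((1, 1), 7, dtype=np.int32)
inputs = [
    grpcclient.InferInput("INPUT", in0.shape, np_to_triton_dtype(in0.dtype))
]
inputs[0].set_data_from_numpy(in0)
client.async_stream_infer("my_sequence_model",
                          inputs,
                          sequence_id=42,
                          sequence_start=True,
                          sequence_end=True)

result, error = completed.get()
client.stop_stream()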