Example #1
 def recognize(self, wav_file, idx=0):
     waveform, sample_rate = sf.read(wav_file)
     samples = np.array([waveform], dtype=np.float32)
     lengths = np.array([[len(waveform)]], dtype=np.int32)
     # Better: pad the waveform up to the next whole second here, e.g.:
     # target_seconds = math.ceil(len(waveform) / sample_rate)
     # target_samples = np.zeros([1, target_seconds * sample_rate], dtype=np.float32)
     # target_samples[0][0:len(waveform)] = waveform
     # samples = target_samples
     sequence_id = 10086 + idx
     result = ''
     inputs = [
         self.protocol_client.InferInput("WAV", samples.shape,
                                         np_to_triton_dtype(samples.dtype)),
         self.protocol_client.InferInput("WAV_LENS", lengths.shape,
                                         np_to_triton_dtype(lengths.dtype))
     ]
     inputs[0].set_data_from_numpy(samples)
     inputs[1].set_data_from_numpy(lengths)
     outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")]
     response = self.triton_client.infer(self.model_name,
                                         inputs,
                                         request_id=str(sequence_id),
                                         outputs=outputs)
     result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
     return [result]
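For orientation, a minimal sketch (not part of the original snippet) of how such an offline recognizer might be constructed and called; the class name, model name, and server address are assumptions.

# Hypothetical wiring for the recognize() method above (Example #1); the
# class name, model name, and URL are placeholders, not from the source.
import tritonclient.grpc as grpcclient


class OfflineSpeechClient:

    def __init__(self, url="localhost:8001", model_name="offline_asr"):
        self.protocol_client = grpcclient
        self.triton_client = grpcclient.InferenceServerClient(url)
        self.model_name = model_name

    # the recognize() method shown in Example #1 would be attached here


if __name__ == "__main__":
    client = OfflineSpeechClient()
    # print(client.recognize("example.wav")[0])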
Example #2
    def recognize(self, wav_file, idx=0):
        waveform, sample_rate = sf.read(wav_file)
        wav_segs = []
        i = 0
        while i < len(waveform):
            if i == 0:
                stride = int(self.first_chunk_in_secs * sample_rate)
                wav_segs.append(waveform[i:i + stride])
            else:
                stride = int(self.other_chunk_in_secs * sample_rate)
                wav_segs.append(waveform[i:i + stride])
            i += len(wav_segs[-1])

        sequence_id = idx + 10086
        # simulate streaming
        for idx, seg in enumerate(wav_segs):
            chunk_len = len(seg)
            if idx == 0:
                chunk_samples = int(self.first_chunk_in_secs * sample_rate)
                expect_input = np.zeros((1, chunk_samples), dtype=np.float32)
            else:
                chunk_samples = int(self.other_chunk_in_secs * sample_rate)
                expect_input = np.zeros((1, chunk_samples), dtype=np.float32)

            expect_input[0][0:chunk_len] = seg
            input0_data = expect_input
            input1_data = np.array([[chunk_len]], dtype=np.int32)

            inputs = [
                self.protocol_client.InferInput(
                    "WAV", input0_data.shape,
                    np_to_triton_dtype(input0_data.dtype)),
                self.protocol_client.InferInput(
                    "WAV_LENS", input1_data.shape,
                    np_to_triton_dtype(input1_data.dtype))
            ]

            inputs[0].set_data_from_numpy(input0_data)
            inputs[1].set_data_from_numpy(input1_data)

            outputs = [
                self.protocol_client.InferRequestedOutput("TRANSCRIPTS")
            ]
            end = False
            if idx == len(wav_segs) - 1:
                end = True

            response = self.triton_client.infer(self.model_name,
                                                inputs,
                                                outputs=outputs,
                                                sequence_id=sequence_id,
                                                sequence_start=idx == 0,
                                                sequence_end=end)
            result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8")
            print("Got response for chunk {}: {}".format(idx + 1, result))
        return [result]
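Similarly, a hedged sketch of the constructor this streaming client appears to assume; the chunk lengths, model name, and URL are illustrative placeholders.

# Hypothetical constructor supplying the attributes recognize() uses above
# (first_chunk_in_secs, other_chunk_in_secs, protocol_client, triton_client,
# model_name); all values here are placeholders.
import tritonclient.grpc as grpcclient


class StreamingSpeechClient:

    def __init__(self, url="localhost:8001", model_name="streaming_asr",
                 first_chunk_in_secs=0.5, other_chunk_in_secs=0.32):
        self.protocol_client = grpcclient
        self.triton_client = grpcclient.InferenceServerClient(url)
        self.model_name = model_name
        self.first_chunk_in_secs = first_chunk_in_secs
        self.other_chunk_in_secs = other_chunk_in_secs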
Example #3
    def predict(self, deployment_name, df):
        single_input_np = None
        if isinstance(df, np.ndarray):
            single_input_np = df

        inputs = []
        if single_input_np is not None:
            model_metadata = self.triton_client.get_model_metadata(
                deployment_name)
            raise MlflowException("Unnamed input is not currently supported")
        else:
            if isinstance(df, pd.DataFrame):
                model_metadata = self.triton_client.get_model_metadata(
                    deployment_name)
                input_dtype = {}
                for input in model_metadata["inputs"]:
                    input_dtype[input["name"]] = triton_to_np_dtype(
                        input["datatype"])
                # Sanity check
                if len(df.columns) != 1:
                    raise MlflowException(
                        "Expect Pandas DataFrame has only 1 column")
                col = df.columns[0]
                for row in df.index:
                    val = df[col][row]
                    # Need to form numpy array of the data type expected
                    if type(df[col][row]) != np.ndarray:
                        val = np.array(val, dtype=input_dtype[row])
                    inputs.append(
                        tritonhttpclient.InferInput(
                            row, val.shape, np_to_triton_dtype(val.dtype)))
                    inputs[-1].set_data_from_numpy(val)
            else:
                for key, val in df.items():
                    inputs.append(
                        tritonhttpclient.InferInput(
                            key, val.shape, np_to_triton_dtype(val.dtype)))
                    inputs[-1].set_data_from_numpy(val)

        try:
            resp = self.triton_client.infer(model_name=deployment_name,
                                            inputs=inputs)
            res = {}
            for output in resp.get_response()['outputs']:
                res[output['name']] = resp.as_numpy(output['name'])
            return {"outputs": res}
        except InferenceServerException as ex:
            raise MlflowException(str(ex))
Example #4
def _convert_column_to_triton_input(col,
                                    name,
                                    input_class=grpcclient.InferInput):
    col = col.reshape(len(col), 1)
    input_tensor = input_class(name, col.shape, np_to_triton_dtype(col.dtype))
    input_tensor.set_data_from_numpy(col)
    return input_tensor
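A small usage sketch for the helper above (assumed, not from the source); the input names, model name, and URL are placeholders.

# Hypothetical usage of _convert_column_to_triton_input defined above.
import numpy as np
import tritonclient.grpc as grpcclient

user_ids = np.array([1, 2, 3], dtype=np.int64)
item_ids = np.array([7, 8, 9], dtype=np.int64)

inputs = [
    _convert_column_to_triton_input(user_ids, "user_id"),
    _convert_column_to_triton_input(item_ids, "item_id"),
]

client = grpcclient.InferenceServerClient("localhost:8001")
response = client.infer("example_model", inputs)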
Example #5
    def run(self, client_metadata):
        trial = self.get_trial()
        model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
        triton_client = client_metadata[0]
        input_name = self.input_name_
        if "librotch" in trial:
            input_name = "INPUT__0"

        tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                                   np.dtype(self.input_dtype_).itemsize), )
        in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
        inputs = [
            grpcclient.InferInput(input_name, tensor_shape,
                                  np_to_triton_dtype(self.input_dtype_)),
        ]
        inputs[0].set_data_from_numpy(in0)

        # Expect an exception for small timeout values.
        try:
            triton_client.infer(model_name, inputs, client_timeout=0.1)
            assert False, "expected inference failure from deadline exceeded"
        except Exception as ex:
            if "Deadline Exceeded" not in ex.message():
                assert False, "timeout_client failed {}".format(self.name_)
            # Expect timeout error as success case
            return 1
Example #6
  def generate_rest_request_from_dictionary(self, row_dict):
    triton_request_inputs = []
    for key, value in row_dict.items():
      t = clients.utils.get_type(value, self._default_float_type,
                                 self._default_int_type)
      if t == np.object_:
        value = clients.utils.map_multi_dimensional_list(
            value, lambda s: s.encode("utf-8"))
      numpy_value = np.array(value, dtype=t)
      triton_request_input = triton_httpclient.InferInput(
          key, list(numpy_value.shape), triton_utils.np_to_triton_dtype(t))
      triton_request_input.set_data_from_numpy(
          numpy_value, binary_data=True)  # binary_data=True by default
      triton_request_inputs.append(triton_request_input)
    # https://github.com/triton-inference-server/client/blob/530bcac5f1574aa2222930076200544eb274245c/src/python/library/tritonclient/http/__init__.py#L81
    # Returns tuple - request and request len to pass in Infer-Header-Content-Length header
    (request, json_size) = triton_httpclient._get_inference_request(
        inputs=triton_request_inputs,
        request_id="",
        outputs=None,
        sequence_id=0,
        sequence_start=0,
        sequence_end=0,
        priority=0,
        timeout=None)

    headers = {}
    if json_size:
      headers["Inference-Header-Content-Length"] = str(json_size)
    return (request, headers)
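A hedged sketch of how the (request, headers) pair built above could be sent; it assumes a plain HTTP POST to Triton's standard KServe-v2 infer endpoint, and the host and model name are placeholders.

# Hypothetical follow-up: POST the serialized request produced above to the
# KServe-v2 HTTP endpoint; host and model name are placeholders.
import requests

def post_inference(host, model_name, request, headers):
  url = "http://{}/v2/models/{}/infer".format(host, model_name)
  resp = requests.post(url, data=request, headers=headers)
  resp.raise_for_status()
  return resp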
Example #7
def convert_df_to_triton_input(column_names,
                               batch,
                               input_class=httpclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = [
        input_class(name, col.shape, np_to_triton_dtype(col.dtype))
        for name, col in columns
    ]
    for i, (name, col) in enumerate(columns):
        inputs[i].set_data_from_numpy(col.values_host)
    return inputs
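For context, a usage sketch (assumed, not from the source) with a cuDF batch; the column names, model name, and URL are placeholders.

# Hypothetical usage of convert_df_to_triton_input defined above with a
# cuDF DataFrame.
import cudf
import tritonclient.http as httpclient

batch = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
inputs = convert_df_to_triton_input(["a", "b"], batch)

with httpclient.InferenceServerClient("localhost:8000") as client:
    response = client.infer("example_model", inputs)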
Example #8
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):

    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]
    time_start = dt.datetime.now()
    response = client.infer(model_name,
                            inputs,
                            request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"

    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
Example #9
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]
    # read in a batch of data to get transforms for
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # convert the batch to Triton inputs
    columns = [(col, batch[col]) for col in col_names]
    inputs = []

    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # placeholder variables for the output
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))
    # make the request
    with httpclient.InferenceServerClient("localhost:8001") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)

    assert (diff < err_tol).all()
Example #10
    def test_content_encoding_unsupported_client(self):
        for encoding in ["chunked", "compress", "deflate", "gzip"]:
            with self.subTest(encoding=encoding):
                headers = {"Transfer-Encoding": encoding}
                np_input = np.arange(8, dtype=np.float32).reshape(1, -1)
                model = "onnx_zero_1_float32"
                # Setup inputs
                inputs = []
                inputs.append(
                    tritonhttpclient.InferInput(
                        'INPUT0', np_input.shape,
                        np_to_triton_dtype(np_input.dtype)))
                inputs[0].set_data_from_numpy(np_input)

                with tritonhttpclient.InferenceServerClient(
                        "localhost:8000") as client:
                    # Python client is expected to raise an exception to reject
                    # 'content-encoding' HTTP headers.
                    with self.assertRaisesRegex(InferenceServerException,
                                                "Unsupported HTTP header"):
                        client.infer(model_name=model,
                                     inputs=inputs,
                                     headers=headers)
Example #11
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that
    # the inference will not have completed when being terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1, ), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
Example #12
    def recognize(self, audio_signal, filenames):
        # Send requests of FLAGS.batch_size audio signals. If the number of
        # audios isn't an exact multiple of FLAGS.batch_size then just
        # start over with the first audio until the batch is filled.

        input_batch = []
        input_filenames = []
        max_num_samples_batch = 0

        for idx in range(self.batch_size):
            input_batch.append(audio_signal[idx].astype(
                self.audio_signals_type))
            input_filenames.append(filenames[idx])
            num_samples = audio_signal[idx].shape[0]

            if (num_samples > max_num_samples_batch):
                max_num_samples_batch = num_samples

        for idx in range(self.batch_size):
            num_samples = input_batch[idx].shape[0]

            mean = np.mean(input_batch[idx])
            std_var = np.std(input_batch[idx])
            gauss_noise = np.random.normal(mean, std_var,
                                           max_num_samples_batch - num_samples)

            input_batch[idx] = np.concatenate(
                (input_batch[idx],
                 gauss_noise.astype(self.audio_signals_type)))

        max_num_samples_batch = np.asarray([max_num_samples_batch],
                                           dtype=self.num_samples_type)

        num_samples_batch = [max_num_samples_batch] * self.batch_size

        # Send request
        print("Sending request to transcribe file(s):",
              ",".join(input_filenames))

        inputs = []

        input_batch = np.asarray(input_batch)
        num_samples_batch = np.asarray(num_samples_batch)

        inputs.append(
            self.prtcl_client.InferInput(
                self.audio_signals_name, input_batch.shape,
                np_to_triton_dtype(input_batch.dtype)))
        inputs.append(
            self.prtcl_client.InferInput(
                self.num_samples_name, num_samples_batch.shape,
                np_to_triton_dtype(num_samples_batch.dtype)))

        if self.prtcl_client is tritonclient.grpc:
            inputs[0].set_data_from_numpy(input_batch)
            inputs[1].set_data_from_numpy(num_samples_batch)
        else:  # http
            inputs[0].set_data_from_numpy(input_batch, binary_data=True)
            inputs[1].set_data_from_numpy(num_samples_batch, binary_data=True)

        outputs = []
        if self.prtcl_client is tritonclient.grpc:
            outputs.append(
                self.prtcl_client.InferRequestedOutput(self.transcripts_name))
        else:
            outputs.append(
                self.prtcl_client.InferRequestedOutput(self.transcripts_name,
                                                       binary_data=True))

        triton_result = self.triton_client.infer(self.model_name,
                                                 inputs=inputs,
                                                 outputs=outputs)
        transcripts = triton_result.as_numpy(self.transcripts_name)

        result = self.postprocess(transcripts, input_filenames)

        return result
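The method above branches on whether self.prtcl_client is the gRPC or HTTP module; a minimal sketch (an assumption, not from the source) of how those attributes could be initialized follows.

# Hypothetical protocol selection for prtcl_client / triton_client; the
# default ports follow Triton's usual gRPC (8001) and HTTP (8000) layout.
import tritonclient.grpc
import tritonclient.http


def make_clients(use_grpc=True, url=None):
    if use_grpc:
        prtcl_client = tritonclient.grpc
        triton_client = tritonclient.grpc.InferenceServerClient(url or "localhost:8001")
    else:
        prtcl_client = tritonclient.http
        triton_client = tritonclient.http.InferenceServerClient(url or "localhost:8000")
    return prtcl_client, triton_client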
Example #13
    # Run the libtorch_visionop model, which depends on a torchvision custom operation
    model_name = FLAGS.model

    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url,
                                               verbose=FLAGS.verbose)

    # Create the data for the input tensors.
    input_data = np.random.rand(1, 3, 10, 10).astype(np.float32)
    box_data = np.array([[1, 1, 2, 3, 4]]).astype(np.float32)

    inputs = []
    inputs.append(
        client_util.InferInput("INPUT__0", input_data.shape,
                               np_to_triton_dtype(input_data.dtype)))
    inputs[0].set_data_from_numpy(input_data)
    inputs.append(
        client_util.InferInput("INPUT__1", box_data.shape,
                               np_to_triton_dtype(box_data.dtype)))
    inputs[1].set_data_from_numpy(box_data)

    results = client.infer(model_name, inputs)

    # We expect 1 result of shape [1, 3, 5, 5].
    output_data = results.as_numpy('OUTPUT__0')
    if output_data is None:
        print("error: expected 'OUTPUT__0'")
        sys.exit(1)

    if output_data.shape != (1, 3, 5, 5):
        print("error: incorrect shape " + str(output_data.shape))
        sys.exit(1)
Example #14
    def check_sequence_async(self,
                             client_metadata,
                             trial,
                             model_name,
                             input_dtype,
                             steps,
                             timeout_ms=DEFAULT_TIMEOUT_MS,
                             batch_size=1,
                             sequence_name="<unknown>",
                             tensor_shape=(1, ),
                             input_name="INPUT",
                             output_name="OUTPUT"):
        """Perform sequence of inferences using async run. The 'steps' holds
        a list of tuples, one for each inference with format:

        (flag_str, value, expected_result, delay_ms)

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial)
                and ("custom" not in trial) and ("onnx" not in trial)
                and ("libtorch" not in trial) and ("plan" not in trial)):
            assert False, "unknown trial type: " + trial

        if "nobatch" not in trial:
            tensor_shape = (batch_size, ) + tensor_shape
        if "libtorch" in trial:
            input_name = "INPUT__0"
            output_name = "OUTPUT__0"

        triton_client = client_metadata[0]
        sequence_id = client_metadata[1]

        # Execute the sequence of inference...
        seq_start_ms = int(round(time.time() * 1000))
        user_data = SequenceScenario.UserData()
        # Ensure there is no running stream
        triton_client.stop_stream()
        triton_client.start_stream(partial(completion_callback, user_data))

        sent_count = 0
        for flag_str, value, _, delay_ms in steps:
            seq_start = False
            seq_end = False
            if flag_str is not None:
                seq_start = ("start" in flag_str)
                seq_end = ("end" in flag_str)

            if input_dtype == np.object_:
                in0 = np.full(tensor_shape, value, dtype=np.int32)
                in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                dtype=object)
                in0 = in0n.reshape(tensor_shape)
            else:
                in0 = np.full(tensor_shape, value, dtype=input_dtype)

            inputs = [
                grpcclient.InferInput(input_name, tensor_shape,
                                      np_to_triton_dtype(input_dtype)),
            ]
            inputs[0].set_data_from_numpy(in0)

            triton_client.async_stream_infer(model_name,
                                             inputs,
                                             sequence_id=sequence_id,
                                             sequence_start=seq_start,
                                             sequence_end=seq_end)
            sent_count += 1

            if delay_ms is not None:
                time.sleep(delay_ms / 1000.0)

        # Process the results in order that they were sent
        result = None
        processed_count = 0
        while processed_count < sent_count:
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error

            (_, value, expected, _) = steps[processed_count]
            processed_count += 1
            if timeout_ms is not None:
                now_ms = int(round(time.time() * 1000))
                if (now_ms - seq_start_ms) > timeout_ms:
                    raise TimeoutException(
                        "Timeout expired for {}, got {} ms".format(
                            sequence_name, (now_ms - seq_start_ms)))

            result = results.as_numpy(
                output_name)[0] if "nobatch" in trial else results.as_numpy(
                    output_name)[0][0]
            if self.verbose_:
                print("{} {}: + {} = {}".format(sequence_name, sequence_id,
                                                value, result),
                      file=self.out_stream_)

            if expected is not None:
                if input_dtype == np.object_:
                    assert int(
                        result
                    ) == expected, "{}: expected result {}, got {} {} {}".format(
                        sequence_name, expected, int(result), trial,
                        model_name)
                else:
                    assert result == expected, "{}: expected result {}, got {} {} {}".format(
                        sequence_name, expected, result, trial, model_name)
        triton_client.stop_stream()
        return sent_count
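Example #14 references a completion_callback and SequenceScenario.UserData that are defined elsewhere; a common queue-based shape for them (an assumption, not the original definitions) is shown below.

# Assumed helpers matching how the loop above consumes
# user_data._completed_requests; not taken from the original file.
import queue


class UserData:

    def __init__(self):
        self._completed_requests = queue.Queue()


def completion_callback(user_data, result, error):
    # tritonclient's streaming callback is invoked as callback(result, error);
    # functools.partial(completion_callback, user_data) binds user_data first.
    user_data._completed_requests.put((result, error))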