Example #1
    def _send_bls_sequence_requests(self, correlation_id):
        # Start request
        try:
            input = pb_utils.Tensor('INPUT', np.array([1000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START,
                correlation_id=correlation_id)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START)
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], input.as_numpy()[0])

            for i in range(10):
                input = pb_utils.Tensor('INPUT', np.array([i], dtype=np.int32))
                infer_request = pb_utils.InferenceRequest(
                    model_name='onnx_nobatch_sequence_int32',
                    inputs=[input],
                    requested_output_names=['OUTPUT'],
                    correlation_id=correlation_id)
                infer_response = infer_request.exec()
                self.assertFalse(infer_response.has_error())

                # The new output is the previous output + the current input
                expected_output = output.as_numpy()[0] + i
                output = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT')
                self.assertEqual(output.as_numpy()[0], expected_output)

            # Final request
            input = pb_utils.Tensor('INPUT', np.array([2000], dtype=np.int32))

            infer_request = pb_utils.InferenceRequest(
                model_name='onnx_nobatch_sequence_int32',
                inputs=[input],
                requested_output_names=['OUTPUT'],
                correlation_id=correlation_id)
            infer_request.set_flags(
                pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
            self.assertEqual(infer_request.flags(),
                             pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)

            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())
            expected_output = output.as_numpy()[0] + input.as_numpy()[0]
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], expected_output)
        except Exception as e:
            self.add_deferred_exception(e)
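The flags shown above are bit flags, so a sequence made of a single request can, in principle, mark both boundaries at once by OR-ing the start and end flags together. The sketch below is an illustrative addition rather than part of the original test; it reuses the model name and correlation_id from the example above and assumes the same unittest-style BLS model.

    def _send_single_request_sequence(self, correlation_id):
        # Minimal sketch: a one-request sequence sets both boundary flags on
        # the same request by combining the bit flags.
        input = pb_utils.Tensor('INPUT', np.array([42], dtype=np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name='onnx_nobatch_sequence_int32',
            inputs=[input],
            requested_output_names=['OUTPUT'],
            flags=(pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START |
                   pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END),
            correlation_id=correlation_id)
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())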
Example #2
    def test_bls_incorrect_args(self):
        with self.assertRaises(TypeError):
            pb_utils.InferenceRequest(
                inputs=[], requested_output_names=['OUTPUT0', 'OUTPUT1'])

        with self.assertRaises(TypeError):
            pb_utils.InferenceRequest(
                model_name='add_sub',
                requested_output_names=['OUTPUT0', 'OUTPUT1'])

        with self.assertRaises(TypeError):
            pb_utils.InferenceRequest(model_name='add_sub', inputs=[])
Example #3
def bls_add_sub(_=None):
    input0_np = np.random.randn(*[16])
    input0_np = input0_np.astype(np.float32)
    input1_np = np.random.randn(*[16])
    input1_np = input1_np.astype(np.float32)
    input0 = pb_utils.Tensor('INPUT0', input0_np)
    input1 = pb_utils.Tensor('INPUT1', input1_np)
    infer_request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    expected_output_0 = input0.as_numpy() + input1.as_numpy()
    expected_output_1 = input0.as_numpy() - input1.as_numpy()

    if not np.all(expected_output_0 == output0.as_numpy()):
        return False

    if not np.all(expected_output_1 == output1.as_numpy()):
        return False

    return True
Example #4
    def test_bls_execute_error(self):
        # Test BLS with a model that has an error during execution.
        infer_request = pb_utils.InferenceRequest(model_name='execute_error',
                                                  inputs=[],
                                                  requested_output_names=[])
        infer_response = infer_request.exec()
        self.assertTrue(infer_response.has_error())
Example #5
    def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
        """
        This function is created to test that the DLPack container works
        properly when the inference response and outputs go out of scope.

        Returns True on success and False on failure.
        """
        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        if infer_response.has_error():
            return False

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        if output0 is None or output1 is None:
            return False

        # When one of the inputs is in GPU the output returned by the model must
        # be in GPU, otherwise the outputs will be in CPU.
        if not input0_pb.is_cpu() or not input1_pb.is_cpu():
            if output0.is_cpu() or output1.is_cpu():
                return False
        else:
            if (not output0.is_cpu()) or (not output1.is_cpu()):
                return False

        # Make sure that the reference count is increased by one when DLPack
        # representation is created.
        rc_before_dlpack_output0 = sys.getrefcount(output0)
        rc_before_dlpack_output1 = sys.getrefcount(output1)

        output0_dlpack = output0.to_dlpack()
        output1_dlpack = output1.to_dlpack()

        rc_after_dlpack_output0 = sys.getrefcount(output0)
        rc_after_dlpack_output1 = sys.getrefcount(output1)

        if rc_after_dlpack_output0 - rc_before_dlpack_output0 != 1:
            return False

        if rc_after_dlpack_output1 - rc_before_dlpack_output1 != 1:
            return False

        # Make sure that reference count decreases after destroying the DLPack
        output0_dlpack = None
        output1_dlpack = None
        rc_after_del_dlpack_output0 = sys.getrefcount(output0)
        rc_after_del_dlpack_output1 = sys.getrefcount(output1)
        if rc_after_del_dlpack_output0 - rc_after_dlpack_output0 != -1:
            return False

        if rc_after_del_dlpack_output1 - rc_after_dlpack_output1 != -1:
            return False

        return output0.to_dlpack(), output1.to_dlpack()
Example #6
    def test_bls_wrong_inputs(self):
        input0 = pb_utils.Tensor('INPUT0', np.random.randn(*[1, 16]))

        infer_request = pb_utils.InferenceRequest(
            model_name='add_sub',
            inputs=[input0],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertTrue(infer_response.has_error())
Example #7
    def _send_identity_tensor(self, size):
        tensor_size = [1, size]
        input0_np = np.random.randn(*tensor_size)
        input0 = pb_utils.Tensor('INPUT0', input0_np.astype(np.float32))
        infer_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            inputs=[input0],
            requested_output_names=['OUTPUT0'])
        return input0_np, infer_request.exec()
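A hypothetical caller for the helper above, added only to show how the returned pair would typically be checked; the test name and tensor size are assumptions, not part of the original code.

    def test_identity_roundtrip(self):
        # Sketch: verify that identity_fp32 echoes the input back unchanged.
        input0_np, infer_response = self._send_identity_tensor(64)
        self.assertFalse(infer_response.has_error())
        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        self.assertIsNotNone(output0)
        np.testing.assert_allclose(output0.as_numpy(),
                                   input0_np.astype(np.float32))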
Example #8
    def test_bls_execute_error(self):
        # Test BLS with a model that has an error during execution.
        infer_request = pb_utils.InferenceRequest(model_name='execute_error',
                                                  inputs=[],
                                                  requested_output_names=[])
        infer_response = infer_request.exec()
        self.assertTrue(infer_response.has_error())
        self.assertEqual(
            infer_response.error().message(),
            "expected 1 inputs but got 0 inputs for model 'execute_error'")
        self.assertEqual(len(infer_response.output_tensors()), 0)
Example #9
    def test_bls_wrong_inputs(self):
        input0 = pb_utils.Tensor('INPUT0', np.random.randn(*[1, 16]))

        infer_request = pb_utils.InferenceRequest(
            model_name='add_sub',
            inputs=[input0],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertTrue(infer_response.has_error())
        self.assertEqual(
            infer_response.error().message(),
            "expected 2 inputs but got 1 inputs for model 'add_sub'")
        self.assertEqual(len(infer_response.output_tensors()), 0)
Example #10
    def test_zero_length_io(self):
        model_name = 'identity_fp32'
        input0 = np.zeros([1, 0], dtype=np.float32)
        input0_pb = pb_utils.Tensor('INPUT0', input0)
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input0_pb],
            requested_output_names=['OUTPUT0'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        self.assertTrue(np.all(output0.as_numpy() == input0))
Example #11
    def test_bls_sync(self):
        infer_request = pb_utils.InferenceRequest(
            model_name='non_existent_model',
            inputs=[],
            requested_output_names=[])
        infer_response = infer_request.exec()

        # Because the model doesn't exist, the inference response must have an
        # error
        self.assertTrue(infer_response.has_error())

        # Make sure that the inference requests can be performed properly after
        # an error.
        self.assertTrue(bls_add_sub())
Example #12
    def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
        """
        This function is created to test that the DLPack container works
        properly when the inference response and outputs go out of scope.
        """
        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        self.assertIsNotNone(output0)
        self.assertIsNotNone(output1)

        # When one of the inputs is in GPU the output returned by the model must
        # be in GPU, otherwise the outputs will be in CPU.
        if not input0_pb.is_cpu() or not input1_pb.is_cpu():
            self.assertTrue((not output0.is_cpu()) and (not output1.is_cpu()))
        else:
            self.assertTrue((output0.is_cpu()) and (output1.is_cpu()))

        # Make sure that the reference count is increased by one when DLPack
        # representation is created.
        rc_before_dlpack_output0 = sys.getrefcount(output0)
        rc_before_dlpack_output1 = sys.getrefcount(output1)

        output0_dlpack = output0.to_dlpack()
        output1_dlpack = output1.to_dlpack()

        rc_after_dlpack_output0 = sys.getrefcount(output0)
        rc_after_dlpack_output1 = sys.getrefcount(output1)

        self.assertEqual(rc_after_dlpack_output0 - rc_before_dlpack_output0, 1)
        self.assertEqual(rc_after_dlpack_output1 - rc_before_dlpack_output1, 1)

        # Make sure that reference count decreases after destroying the DLPack
        output0_dlpack = None
        output1_dlpack = None
        rc_after_del_dlpack_output0 = sys.getrefcount(output0)
        rc_after_del_dlpack_output1 = sys.getrefcount(output1)
        self.assertEqual(rc_after_del_dlpack_output0 - rc_after_dlpack_output0,
                         -1)
        self.assertEqual(rc_after_del_dlpack_output1 - rc_after_dlpack_output1,
                         -1)

        return output0.to_dlpack(), output1.to_dlpack()
Example #13
    def response_thread(self, response_sender, in_input):
        # The response_sender is used to send response(s) associated with the
        # corresponding request.
        # Sleep 5 seconds to make sure the main thread has exited.
        time.sleep(5)

        status = self.execute_gpu_bls()
        if not status:
            infer_response = pb_utils.InferenceResponse(
                error="GPU BLS test failed.")
            response_sender.send(infer_response)
        else:
            in_value = in_input
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', in_input)])
            infer_response = infer_request.exec()
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if infer_response.has_error():
                response = pb_utils.InferenceResponse(
                    error=infer_response.error().message())
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            elif np.any(in_input != output0.as_numpy()):
                error_message = (
                    "BLS Request input and BLS response output do not match."
                    f" {in_value} != {output0.as_numpy()}")
                response = pb_utils.InferenceResponse(error=error_message)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            else:
                output_tensors = [pb_utils.Tensor('OUT', in_value)]
                response = pb_utils.InferenceResponse(
                    output_tensors=output_tensors)
                response_sender.send(
                    response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
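A note on the decoupled API used above: every send() in this example attaches a response together with the final flag. If all responses have already been delivered, the stream can instead be closed by sending only the final flag. A minimal sketch of that variant, assuming the same response_sender:

            # Sketch: close the response stream without attaching a response
            # body, once every response for this request has been sent.
            response_sender.send(
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)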
Example #14
    def execute(self, requests):
        responses = []
        for request in requests:
            # Get INPUT0
            input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0')
            infer_request = pb_utils.InferenceRequest(
                model_name='identity',
                requested_output_names=["OUTPUT0"],
                inputs=[input0])
            infer_response = infer_request.exec()

            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
            ])
            responses.append(inference_response)

        return responses
Example #15
def create_addsub_inference_request(gpu=False):
    if not gpu:
        input0_np = np.random.randn(16)
        input1_np = np.random.randn(16)
        input0_np = input0_np.astype(np.float32)
        input1_np = input1_np.astype(np.float32)
        input0 = pb_utils.Tensor('INPUT0', input0_np)
        input1 = pb_utils.Tensor('INPUT1', input1_np)
    else:
        input0_pytorch = torch.rand(16).to('cuda')
        input1_pytorch = torch.rand(16).to('cuda')
        input0 = pb_utils.Tensor.from_dlpack('INPUT0',
                                             to_dlpack(input0_pytorch))
        input1 = pb_utils.Tensor.from_dlpack('INPUT1',
                                             to_dlpack(input1_pytorch))

    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    return input0, input1, infer_request
Example #16
    def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
        input0 = torch.rand(16)
        input1 = torch.rand(16)

        if is_input0_gpu:
            input0 = input0.to('cuda')

        if is_input1_gpu:
            input1 = input1.to('cuda')

        input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0))
        input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1))

        infer_request = pb_utils.InferenceRequest(
            model_name='dlpack_add_sub',
            inputs=[input0_pb, input1_pb],
            requested_output_names=['OUTPUT0', 'OUTPUT1'])
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())

        output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
        output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
        self.assertIsNotNone(output0)
        self.assertIsNotNone(output1)

        expected_output_0 = from_dlpack(
            input0_pb.to_dlpack()).to('cpu') + from_dlpack(
                input1_pb.to_dlpack()).to('cpu')
        expected_output_1 = from_dlpack(
            input0_pb.to_dlpack()).to('cpu') - from_dlpack(
                input1_pb.to_dlpack()).to('cpu')

        self.assertTrue(
            torch.all(expected_output_0 == from_dlpack(output0.to_dlpack()).to(
                'cpu')))
        self.assertTrue(
            torch.all(expected_output_1 == from_dlpack(output1.to_dlpack()).to(
                'cpu')))
Example #17
    def execute(self, requests):
        """ This function is called on inference request.
        """

        # Only generate the error for the first request
        for i, request in enumerate(requests):
            request_input = pb_utils.get_input_tensor_by_name(request, 'IN')

            # Sync BLS request
            infer_request = pb_utils.InferenceRequest(
                model_name='identity_fp32',
                requested_output_names=["OUTPUT0"],
                inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())])
            infer_response = infer_request.exec()
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    f"BLS Response has an error: {infer_response.error().message()}"
                )

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if np.any(output0.as_numpy() != request_input.as_numpy()):
                raise pb_utils.TritonModelException(
                    f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}"
                )

            thread1 = threading.Thread(target=self.response_thread,
                                       args=(request.get_response_sender(),
                                             pb_utils.get_input_tensor_by_name(
                                                 request, 'IN').as_numpy()))
            thread1.daemon = True
            with self.inflight_thread_count_lck:
                self.inflight_thread_count += 1
            thread1.start()

        return None
Example #18
    def batch_rescoring(self, score_hyps, hist_enc, hist_mask_len, max_len):
        """
        score_hyps: [((ctc_score, (id1, id2, id3, ....)), (), ...), ....]
        hist_enc: [len1xF, len2xF, .....]
        hist_mask: [1x1xlen1, 1x1xlen2]
        return bzx1  best_index
        """
        bz = len(hist_enc)
        f = hist_enc[0].shape[-1]
        beam_size = self.beam_size
        encoder_lens = np.zeros((bz, 1), dtype=np.int32)
        encoder_out = torch.zeros((bz, max_len, f), dtype=self.dtype)
        hyps = []
        ctc_score = torch.zeros((bz, beam_size), dtype=self.dtype)
        max_seq_len = 0
        for i in range(bz):
            cur_len = hist_enc[i].shape[0]
            encoder_out[i, 0:cur_len] = hist_enc[i]
            encoder_lens[i, 0] = hist_mask_len[i]

            # process candidate
            if len(score_hyps[i]) < beam_size:
                to_append = (beam_size - len(score_hyps[i])) * [(-10000, ())]
                score_hyps[i] = list(score_hyps[i]) + to_append
            for idx, c in enumerate(score_hyps[i]):
                score, idlist = c
                if score < -10000:
                    score = -10000
                ctc_score[i][idx] = score
                hyps.append(list(idlist))
                if len(hyps[-1]) > max_seq_len:
                    max_seq_len = len(hyps[-1])

        max_seq_len += 2
        hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
        hyps_pad_sos_eos = hyps_pad_sos_eos * self.eos  # fill eos
        if self.bidecoder:
            r_hyps_pad_sos_eos = np.ones((bz, beam_size, max_seq_len), dtype=np.int64)
            r_hyps_pad_sos_eos = r_hyps_pad_sos_eos * self.eos

        hyps_lens_sos = np.ones((bz, beam_size), dtype=np.int32)
        bz_id = 0
        for idx, cand in enumerate(hyps):
            bz_id = idx // beam_size
            length = len(cand) + 2
            bz_offset = idx % beam_size
            pad_cand = [self.sos] + cand + [self.eos]
            hyps_pad_sos_eos[bz_id][bz_offset][0 : length] = pad_cand
            if self.bidecoder:
                r_pad_cand = [self.sos] + cand[::-1] + [self.eos]
                r_hyps_pad_sos_eos[bz_id][bz_offset][0:length] = r_pad_cand
            hyps_lens_sos[bz_id][idx % beam_size] = len(cand) + 1
        in0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
        in1 = pb_utils.Tensor("encoder_out_lens", encoder_lens)
        in2 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
        in3 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
        input_tensors = [in0, in1, in2, in3]
        if self.bidecoder:
            in4 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
            input_tensors.append(in4)
        in5 = pb_utils.Tensor.from_dlpack("ctc_score", to_dlpack(ctc_score))
        input_tensors.append(in5)
        request = pb_utils.InferenceRequest(model_name='decoder',
                                            requested_output_names=['best_index'],
                                            inputs=input_tensors)
        response = request.exec()
        best_index = pb_utils.get_output_tensor_by_name(response, 'best_index')
        best_index = from_dlpack(best_index.to_dlpack()).clone()
        best_index = best_index.numpy()[:, 0]
        return best_index
Example #19
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate through the list of requests and
        # create an instance of the pb_utils.InferenceResponse class for each
        # of them. You should avoid storing any of the input Tensors in the
        # class attributes as they will be overridden in subsequent inference
        # requests. You can make a copy of the underlying NumPy array and
        # store it if it is required.

        batch_encoder_out, batch_encoder_lens = [], []
        batch_log_probs, batch_log_probs_idx = [], []
        batch_count = []
        batch_root = TrieVector()
        batch_start = []
        root_dict = {}

        encoder_max_len = 0
        hyps_max_len = 0
        total = 0
        for request in requests:
            # Perform inference on the request and append it to responses list...
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            in_2 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs")
            in_3 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs_idx")

            batch_encoder_out.append(in_0.as_numpy())
            encoder_max_len = max(encoder_max_len,
                                  batch_encoder_out[-1].shape[1])

            cur_b_lens = in_1.as_numpy()
            batch_encoder_lens.append(cur_b_lens)
            cur_batch = cur_b_lens.shape[0]
            batch_count.append(cur_batch)

            cur_b_log_probs = in_2.as_numpy()
            cur_b_log_probs_idx = in_3.as_numpy()
            for i in range(cur_batch):
                cur_len = cur_b_lens[i]
                cur_probs = cur_b_log_probs[i][
                    0:cur_len, :].tolist()  # T X Beam
                cur_idx = cur_b_log_probs_idx[i][
                    0:cur_len, :].tolist()  # T x Beam
                batch_log_probs.append(cur_probs)
                batch_log_probs_idx.append(cur_idx)
                root_dict[total] = PathTrie()
                batch_root.append(root_dict[total])
                batch_start.append(True)
                total += 1

        score_hyps = ctc_beam_search_decoder_batch(
            batch_log_probs,
            batch_log_probs_idx,
            batch_root,
            batch_start,
            self.beam_size,
            min(total, self.num_processes),
            blank_id=self.blank_id,
            space_id=-2,
            cutoff_prob=self.cutoff_prob,
            ext_scorer=self.lm)
        all_hyps = []
        all_ctc_score = []
        max_seq_len = 0
        for seq_cand in score_hyps:
            # if candidates less than beam size
            if len(seq_cand) != self.beam_size:
                seq_cand = list(seq_cand)
                seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"),
                                                                 (0, ))]

            for score, hyps in seq_cand:
                all_hyps.append(list(hyps))
                all_ctc_score.append(score)
                max_seq_len = max(len(hyps), max_seq_len)

        beam_size = self.beam_size
        feature_size = self.feature_size
        hyps_max_len = max_seq_len + 2
        in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
        in_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
        if self.bidecoder:
            in_r_hyps_pad_sos_eos = np.ones(
                (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

        in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

        in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                                  dtype=self.data_type)
        in_encoder_out_lens = np.zeros(total, dtype=np.int32)
        st = 0
        for b in batch_count:
            t = batch_encoder_out.pop(0)
            in_encoder_out[st:st + b, 0:t.shape[1]] = t
            in_encoder_out_lens[st:st + b] = batch_encoder_lens.pop(0)
            for i in range(b):
                for j in range(beam_size):
                    cur_hyp = all_hyps.pop(0)
                    cur_len = len(cur_hyp) + 2
                    in_hyp = [self.sos] + cur_hyp + [self.eos]
                    in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                    in_hyps_lens_sos[st + i][j] = cur_len - 1
                    if self.bidecoder:
                        r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                        in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                    in_ctc_score[st + i][j] = all_ctc_score.pop(0)
            st += b
        in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)
        in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
        in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
        in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
        in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
        input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
        if self.bidecoder:
            in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                          in_r_hyps_pad_sos_eos)
            input_tensors.append(in_tensor_4)
        in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
        input_tensors.append(in_tensor_5)

        inference_request = pb_utils.InferenceRequest(
            model_name='decoder',
            requested_output_names=['best_index'],
            inputs=input_tensors)

        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(
                inference_response.error().message())
        else:
            # Extract the output tensors from the inference response.
            best_index = pb_utils.get_output_tensor_by_name(
                inference_response, 'best_index')
            best_index = best_index.as_numpy()
            hyps = []
            idx = 0
            for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos):
                best_idx = best_index[idx][0]
                best_cand_len = cand_lens[best_idx] - 1  # remove sos
                best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
                hyps.append(best_cand)
                idx += 1

            hyps = map_batch(
                hyps, self.vocabulary,
                min(multiprocessing.cpu_count(), len(in_ctc_score)))
            st = 0
            for b in batch_count:
                sents = np.array(hyps[st:st + b])
                out0 = pb_utils.Tensor("OUTPUT0",
                                       sents.astype(self.out0_dtype))
                inference_response = pb_utils.InferenceResponse(
                    output_tensors=[out0])
                responses.append(inference_response)
                st += b
        return responses
Example #20
    def Execute(self, request, context):
        """Execute is called on TRITONBACKEND_ModelInstanceExecute. Inference
        happens in this function. This function mainly converts gRPC
        protobufs to the triton_python_backend_utils.InferenceRequest and
        triton_python_backend_utils.InferenceResponse.
        Parameters
        ----------
        request : python_host_pb2.ExecuteRequest
            Contains a `requests` attribute which is a list of python_host_pb2.InferenceRequest
        """

        requests = request.requests
        inference_requests = []
        for request in requests:
            # This object contains a list of tpb_utils.Tensor
            input_tensors = []
            for request_input in request.inputs:
                x = request_input
                numpy_type = tpb_utils.triton_to_numpy_type(x.dtype)

                # We need to deserialize TYPE_STRING
                if numpy_type in (np.object_, np.bytes_):
                    numpy_data = deserialize_bytes_tensor(x.raw_data)
                    tensor = tpb_utils.Tensor(x.name,
                                              numpy_data.reshape(x.dims))
                    input_tensors.append(tensor)
                else:
                    tensor = tpb_utils.Tensor(
                        x.name,
                        np.frombuffer(x.raw_data,
                                      dtype=numpy_type).reshape(x.dims))
                    input_tensors.append(tensor)

            request_id = request.id
            correlation_id = request.correlation_id
            requested_output_names = request.requested_output_names
            inference_request = tpb_utils.InferenceRequest(
                input_tensors, request_id, correlation_id,
                requested_output_names)
            inference_requests.append(inference_request)

        # Execute inference on the Python backend. `responses` contains a list
        # of triton_python_backend_utils.InferenceResponse. Each backend must
        # implement an `execute` method.
        if not hasattr(self.backend, 'execute'):
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details('Backend does not implement `execute` method')
            return ExecuteResponse()

        responses = self.backend.execute(inference_requests)

        # Make sure that number of InferenceResponse and InferenceRequest
        # objects match
        if len(inference_requests) != len(responses):
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(
                "Number of inference responses and requests don't match "
                f'(requests={len(inference_requests)} != '
                f'responses={len(responses)})')
            return ExecuteResponse()

        exec_responses = []
        for response in responses:
            # If there is an error do not look into output_tensors
            if response.has_error():
                error = Error(message=response.error().message())
                inference_response = InferenceResponse(outputs=[],
                                                       error=error,
                                                       failed=True)
                exec_responses.append(inference_response)
                continue

            output_tensors = response.output_tensors()
            response_tensors = []

            for output_tensor in output_tensors:
                output_np_array = output_tensor.as_numpy()
                output_shape = output_np_array.shape

                # We need to serialize TYPE_STRING
                if output_np_array.dtype.type in (np.object_, np.bytes_):
                    output_np_array = serialize_byte_tensor(output_np_array)

                tensor = Tensor(name=output_tensor.name(),
                                dtype=tpb_utils.numpy_to_triton_type(
                                    output_np_array.dtype.type),
                                dims=output_shape,
                                raw_data=output_np_array.tobytes())

                response_tensors.append(tensor)
            exec_responses.append(InferenceResponse(outputs=response_tensors))
        execute_response = ExecuteResponse(responses=exec_responses)

        return execute_response
Example #21
    def test_infer_request_args(self):
        # Dummy arguments used in the tests.
        inputs = [
            pb_utils.Tensor('INPUT0', np.asarray([1, 2], dtype=np.int32))
        ]
        model_name = 'my_model'
        requested_output_names = ['my_output']

        #
        # inputs field validation
        #

        # Test list of None as inputs
        with self.assertRaises(pb_utils.TritonModelException) as e:
            pb_utils.InferenceRequest(
                inputs=[None],
                model_name=model_name,
                requested_output_names=requested_output_names)

        # Test None object as list of inputs
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                inputs=None,
                model_name=model_name,
                requested_output_names=requested_output_names)

        # model_name validation
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                model_name=None,
                inputs=inputs,
                requested_output_names=requested_output_names)

        #
        # Requested output name validations
        #

        # Test list of None objects as requested_output_names
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(requested_output_names=[None],
                                      inputs=inputs,
                                      model_name=model_name)

        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(requested_output_names=None,
                                      inputs=inputs,
                                      model_name=model_name)

        # Other arguments validation

        # correlation_id set to None
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                requested_output_names=requested_output_names,
                inputs=inputs,
                model_name=model_name,
                correlation_id=None)

        # request_id set to None
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                requested_output_names=requested_output_names,
                inputs=inputs,
                model_name=model_name,
                request_id=None)

        # model_version set to None
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                requested_output_names=requested_output_names,
                inputs=inputs,
                model_name=model_name,
                model_version=None)

        # flags set to None
        with self.assertRaises(TypeError) as e:
            pb_utils.InferenceRequest(
                requested_output_names=requested_output_names,
                inputs=inputs,
                model_name=model_name,
                flags=None)

        # Empty lists should not raise an exception
        pb_utils.InferenceRequest(requested_output_names=[],
                                  inputs=[],
                                  model_name=model_name)
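As a counterpart to the negative cases above, the fragment below sketches a construction that sets every optional argument exercised in this test with a plausible value; the concrete values (and the assumption that these types are accepted) are illustrative, not taken from the original test.

        # Sketch: a fully-specified request with illustrative values for the
        # optional arguments validated above.
        pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=inputs,
            requested_output_names=requested_output_names,
            request_id='0',
            correlation_id=5,
            model_version=1,
            flags=0)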
Example #22
    def Execute(self, request, context):
        """Execute is called on TRITONBACKEND_ModelInstanceExecute. Inference
        happens in this function. This function mainly converts gRPC
        protobufs to the triton_python_backend_utils.InferenceRequest and
        triton_python_backend_utils.InferenceResponse.

        Parameters
        ----------
        request : python_host_pb2.ExecuteRequest
            Contains a `requests` attribute which is a list of python_host_pb2.InferenceRequest
        """

        requests = request.requests
        inference_requests = []
        for request in requests:
            # This object contains a list of tpb_utils.Tensor
            input_tensors = []
            for request_input in request.inputs:
                x = request_input
                tensor = tpb_utils.Tensor(
                    x.name,
                    np.frombuffer(x.raw_data,
                                  dtype=protobuf_to_numpy_type(
                                      x.dtype)).reshape(x.dims))
                input_tensors.append(tensor)

            request_id = request.id
            correlation_id = request.correlation_id
            requested_output_names = request.requested_output_names
            inference_request = tpb_utils.InferenceRequest(
                input_tensors, request_id, correlation_id,
                requested_output_names)
            inference_requests.append(inference_request)

        # Execute inference on the Python backend. `responses` contains a list
        # of triton_python_backend_utils.InferenceResponse.
        responses = self.backend(inference_requests)

        # Make sure that number of InferenceResponse and InferenceRequest
        # objects match
        if len(inference_requests) != len(responses):
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(
                "Number of inference responses and requests don't match "
                f'(requests={len(inference_requests)} != '
                f'responses={len(responses)})')
            return ExecuteResponse()

        exec_responses = []
        for response in responses:
            output_tensors = response.output_tensors()
            response_tensors = []

            for output_tensor in output_tensors:
                output_np_array = output_tensor.numpy_array()
                tensor = Tensor(name=output_tensor.name(),
                                dtype=numpy_to_protobuf_type(
                                    output_np_array.dtype.type),
                                dims=output_np_array.shape,
                                raw_data=output_np_array.tobytes())
                response_tensors.append(tensor)
            exec_responses.append(InferenceResponse(outputs=response_tensors))
        execute_response = ExecuteResponse(responses=exec_responses)

        return execute_response
Example #23
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []
        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # Get Model Name
            model_name = pb_utils.get_input_tensor_by_name(
                request, "MODEL_NAME")

            # Model Name string
            model_name_string = model_name.as_numpy()[0]

            # Create inference request object
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name_string,
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[in_0, in_1])

            # Perform synchronous blocking inference request
            infer_response = infer_request.exec()

            # Make sure that the inference response doesn't have an error. If
            # it has an error and you can't proceed with your model execution
            # you can raise an exception.
            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            #
            # Because the infer_response of the models contains the final
            # outputs with correct output names, we can just pass the list
            # of outputs to the InferenceResponse object.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=infer_response.output_tensors())
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
Example #24
    def execute(self, requests):
        responses = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            gpu_output = pb_utils.get_input_tensor_by_name(
                request, "GPU_OUTPUT").as_numpy()

            if input0.is_cpu():
                if not gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cuda()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))
            else:
                if gpu_output[0]:
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", input0.to_dlpack())
                else:
                    output0_pytorch = from_dlpack(input0.to_dlpack()).cpu()
                    output0 = pb_utils.Tensor.from_dlpack(
                        "OUTPUT0", to_dlpack(output0_pytorch))

            next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT",
                                              gpu_output[1:])

            # Do not perform BLS inference if it is the first
            # model in the pipeline.
            if self._model_name != 'dlpack_io_identity_1':
                infer_request = pb_utils.InferenceRequest(
                    model_name='dlpack_io_identity_1',
                    inputs=[
                        input0,
                        pb_utils.get_input_tensor_by_name(
                            request, "GPU_OUTPUT")
                    ],
                    requested_output_names=['OUTPUT0'])
                infer_response = infer_request.exec()

                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

                bls_output0 = pb_utils.get_output_tensor_by_name(
                    infer_response, 'OUTPUT0')
                if not output0.is_cpu():
                    bls_output0 = from_dlpack(
                        bls_output0.to_dlpack()).detach().cpu().numpy()
                else:
                    bls_output0 = bls_output0.as_numpy()

                if not input0.is_cpu():
                    input0 = from_dlpack(
                        input0.to_dlpack()).detach().cpu().numpy()
                else:
                    input0 = input0.as_numpy()

                if not np.allclose(bls_output0, input0):
                    raise pb_utils.TritonModelException(
                        'BLS input and output tensors are not equal')

            responses.append(
                pb_utils.InferenceResponse([output0, next_gpu_output]))

        return responses
Example #25
    async def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference request is made
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []
        # Every Python backend must iterate over every one of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for request in requests:
            # Get INPUT0
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # Get INPUT1
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            # List of awaitables containing inflight inference responses.
            inference_response_awaits = []
            for model_name in ['pytorch', 'add_sub']:
                # Create inference request object
                infer_request = pb_utils.InferenceRequest(
                    model_name=model_name,
                    requested_output_names=["OUTPUT0", "OUTPUT1"],
                    inputs=[in_0, in_1])

                # Store the awaitable inside the array. We don't need
                # the inference response immediately so we do not `await`
                # here.
                inference_response_awaits.append(infer_request.async_exec())

            # Wait for all the inference requests to finish. The execution
            # of the Python script will be blocked until all the awaitables
            # are resolved.
            inference_responses = await asyncio.gather(
                *inference_response_awaits)

            for infer_response in inference_responses:
                # Make sure that the inference response doesn't have an error.
                # If it has an error and you can't proceed with your model
                # execution you can raise an exception.
                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())

            # Get the OUTPUT0 from the "pytorch" model inference resposne
            pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[0], "OUTPUT0")

            # Get the OUTPUT1 from the "addsub" model inference resposne
            addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
                inference_responses[1], "OUTPUT1")

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            #
            # Because the infer_response of the models contains the final
            # outputs with correct output names, we can just pass the list
            # of outputs to the InferenceResponse object.
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[pytorch_output0_tensor, addsub_output1_tensor])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
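A small aside on the async API shown above: when only one BLS call is needed, the awaitable returned by async_exec can simply be awaited in place instead of being collected with asyncio.gather. A minimal sketch reusing the names from the example:

            # Sketch: await a single BLS request directly.
            infer_request = pb_utils.InferenceRequest(
                model_name='add_sub',
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[in_0, in_1])
            infer_response = await infer_request.async_exec()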
Example #26
    def test_bls_tensor_lifecycle(self):
        model_name = 'dlpack_identity'

        # A 10 MB tensor.
        input_size = 10 * 1024 * 1024

        # Sending the tensor 50 times to test whether the deallocation is
        # happening correctly. If the deallocation doesn't happen correctly,
        # there will be an out of shared memory error.
        for _ in range(50):
            input0 = np.ones([1, input_size], dtype=np.float32)
            input0_pb = pb_utils.Tensor('INPUT0', input0)
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                inputs=[input0_pb],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            np.testing.assert_equal(output0.as_numpy(), input0,
                                    "BLS CPU memory lifecycle failed.")

        # Checking the same with the GPU tensors.
        for index in range(50):
            input0 = None
            infer_request = None
            input0_pb = None

            torch.cuda.empty_cache()
            free_memory, _ = torch.cuda.mem_get_info()
            if index == 1:
                recorded_memory = free_memory

            if index > 1:
                self.assertEqual(free_memory, recorded_memory,
                                 "GPU memory lifecycle test failed.")

            input0 = torch.ones([1, input_size],
                                dtype=torch.float32).to('cuda')
            input0_pb = pb_utils.Tensor.from_dlpack('INPUT0',
                                                    to_dlpack(input0))
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                inputs=[input0_pb],
                requested_output_names=['OUTPUT0'])
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')
            output0_pytorch = from_dlpack(output0.to_dlpack())

            # Set output0 and infer_response to None to make sure that
            # output0_pytorch, which holds the DLPack view, is still valid.
            output0 = None
            infer_response = None
            self.assertTrue(
                torch.all(output0_pytorch == input0),
                f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model."
            )