    def test_multi_batch_not_preferred_different_shape(self):
        # Send two requests with total static batch size in between
        # preferred sizes. Then send a request with a different shape
        # and a non-preferred batch size. This should cause the first
        # two requests to be immediately responded to and the third
        # response to be delayed by the max batch queue delay.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)
                self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1, (3000, None))))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 3, (3000, None))))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1,
                                           (_max_queue_delay_ms * 1.5,
                                            _max_queue_delay_ms)),
                                     kwargs={'input_size': 8}))
                threads[0].start()
                threads[1].start()
                time.sleep(1)
                threads[2].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
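                # check_status() is a helper not shown in this file; based on
                # how these calls read, the trailing arguments are presumably
                # the batch sizes expected to have been executed, the expected
                # model execution count, and the expected inference count.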
                self.check_status(url, protocol, model_name, (1, 3), 2, 5)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_multi_batch_delayed_sum_gt_max_preferred(self):
        # Send two requests with first not having preferred size and
        # second being smaller than max preferred size but the sum of
        # the requests being larger than max preferred size. Use
        # TRTSERVER_DELAY_SCHEDULER in the environment so that
        # requests can be queued up before scheduler starts
        # servicing. This should cause the first response to be returned
        # immediately but the second response, since it alone is not
        # greater than max preferred size, will be delayed.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)

                # Need scheduler to wait for queue to contain 2 requests
                self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
                self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]),
                                 2)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 3, True, 3000)))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 4, False, _max_queue_delay)))
                threads[0].start()
                time.sleep(1)
                threads[1].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (3, 4), 2, 7)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_multi_batch_use_biggest_preferred(self):
        # Send multiple requests that sum to multiple preferred sizes
        # and make sure the largest preferred size is used for the
        # batch. Requires TRTSERVER_DELAY_SCHEDULER in the environment
        # so that requests can be queued up before scheduler starts
        # servicing.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32)

                self.check_setup(url, protocol, model_name)

                # Need scheduler to wait for queue to contain 6 requests
                self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
                self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 6)

                threads = []
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, True, 3000)))
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (1,), 1, 6)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_parse_error_modelfail(self):
        # --strict-readiness=true so server is live but not ready
        input_size = 16
        tensor_shape = (input_size,)

        # Server was started but with a model that fails to load
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                model_name = tu.get_model_name('graphdef', np.float32, np.float32, np.float32)
                ctx = ServerStatusContext(pair[0], pair[1], model_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)
                uptime = ss.uptime_ns
                self.assertGreater(uptime, 0)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(model_name in ss.model_status,
                                "expected status for model " + model_name)
                for (k, v) in iteritems(ss.model_status[model_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_UNAVAILABLE)

                hctx = ServerHealthContext(pair[0], pair[1], True)
                self.assertFalse(hctx.is_ready())
                self.assertTrue(hctx.is_live())

        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        try:
            iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                           np.float32, np.float32, np.float32)
            self.assertTrue(False, "expected error for unavailable model " + model_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(
                ex.message().startswith(
                    "Inference request for unknown model 'graphdef_float32_float32_float32'"))
    def test_batch_request_for_nobatching_model(self):
        input_size = 16

        # graphdef_nobatch_int32_int8_int8 is the non-batching version.
        # The server should return an error if the batch-size dimension
        # is included in the shape.
        tensor_shape = (1, input_size)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef_nobatch", np.int32, np.int8, np.int8)
            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000', verbose=True)
                inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001', verbose=True)
                inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            try:
                results = triton_client.infer(model_name,
                                  inputs,
                                  outputs=outputs)
                self.assertTrue(False, "expected failure with batched request for non-batching model")
            except InferenceServerException as ex:
                pass
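
            # For reference, a request that the non-batching model accepts
            # simply omits the leading batch dimension from the shape. A
            # minimal sketch reusing the client objects created above
            # (illustrative only, not part of the original test):
            ok_shape = (input_size,)
            ok_in0 = np.random.randint(low=0, high=100, size=ok_shape, dtype=np.int32)
            ok_in1 = np.random.randint(low=0, high=100, size=ok_shape, dtype=np.int32)
            if protocol == "http":
                ok_inputs = [tritonhttpclient.InferInput('INPUT0', ok_shape, "INT32"),
                             tritonhttpclient.InferInput('INPUT1', ok_shape, "INT32")]
            else:
                ok_inputs = [tritongrpcclient.InferInput('INPUT0', ok_shape, "INT32"),
                             tritongrpcclient.InferInput('INPUT1', ok_shape, "INT32")]
            ok_inputs[0].set_data_from_numpy(ok_in0)
            ok_inputs[1].set_data_from_numpy(ok_in1)
            # triton_client.infer(model_name, ok_inputs, outputs=outputs)
            # would be expected to succeed with this shape.
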
    def test_multi_batch_use_best_preferred(self):
        # Send multiple requests where the initial ones sum to a
        # preferred size and then extra request goes beyond that. The
        # initial requests should be handled immediately at the
        # preferred batch size and then the other one after
        # timeout. Use TRTSERVER_DELAY_SCHEDULER in the environment so
        # that requests can be queued up before scheduler starts
        # servicing.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32)

                self.check_setup(url, protocol, model_name)

                # Need scheduler to wait for queue to contain 3 requests
                self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
                self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 3)

                threads = []
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, (3000, None))))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1, (3000, None))))
                threads.append(threading.Thread(target=self.check_response,
                                                args=(trial, 1,
                                                      (_max_queue_delay_ms * 1.5, _max_queue_delay_ms))))
                threads[0].start()
                threads[1].start()
                time.sleep(1)
                threads[2].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (1,), 2, 3)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_multi_batch_sum_gt_max_preferred(self):
        # Send two requests with first not having preferred size and
        # second being smaller than max preferred size but the sum of
        # the requests being larger than max preferred size. Delay the
        # second request so that it arrives after the first is already
        # being processed by the dynamic batcher. This should cause the
        # first response to be returned immediately but the second response,
        # since it alone is not greater than max preferred size, will
        # be delayed.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)
                self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 3, (3000, None))))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 4,
                                           (_max_queue_delay_ms * 1.5,
                                            _max_queue_delay_ms))))
                threads[0].start()
                time.sleep(1)
                threads[1].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (3, 4), 2, 7)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_multi_batch_different_shape(self):
        # Send two requests with sum of static batch sizes ==
        # preferred size, but with different shapes (using model with
        # variable-size tensors). This should cause the requests to
        # not be batched. The first response will come back
        # immediately and the second delayed by the max batch queue
        # delay
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)
                self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1, (3000, None)),
                                     kwargs={'input_size': 16}))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1,
                                           (_max_queue_delay_ms * 1.5,
                                            _max_queue_delay_ms)),
                                     kwargs={'input_size': 8}))
                threads[0].start()
                time.sleep(1)
                threads[1].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (1, ), 2, 2)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_bs2_request_for_non_batching_model(self):
        input_size = 16
        tensor_shape = (input_size, )

        # graphdef_int32_int8_int8 has a non-batching version. If we
        # make a batch-size two (or greater) request for that model it
        # should fail.
        for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'),
                              (ProtocolType.GRPC, 'localhost:8001')):
            model_name = tu.get_model_name("graphdef_nobatch", np.int32,
                                           np.int8, np.int8)
            in0 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)
            in1 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)

            try:
                ctx = InferContext(url, protocol, model_name, None, True)
                results = ctx.run({
                    'INPUT0': (in0, ),
                    'INPUT1': (in1, )
                }, {
                    'OUTPUT0': InferContext.ResultFormat.RAW,
                    'OUTPUT1': InferContext.ResultFormat.RAW
                }, 2)
                self.assertTrue(
                    False,
                    "expected failure with batch-size 2 for non-batching model"
                )

            except InferenceServerException as ex:
                pass
    def _erroneous_infer(self, tensor_shape, batch_size):
        import tritonhttpclient
        item_size = batch_size
        for dim in tensor_shape:
            item_size *= dim
        full_shape = (batch_size, ) + tensor_shape
        input_np = np.arange(item_size, dtype=self.dtype_).reshape(full_shape)
        expected_output0_np = input_np + input_np
        expected_output1_np = input_np - input_np

        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
        inputs[-1].set_data_from_numpy(input_np)
        inputs.append(
            tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
        inputs[-1].set_data_from_numpy(input_np)
        outputs = []
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

        model_name = tu.get_model_name(self.model_name_, self.dtype_,
                                       self.dtype_, self.dtype_)
        results = tritonhttpclient.InferenceServerClient(
            "localhost:8000", verbose=True).infer(model_name=model_name,
                                                  inputs=inputs,
                                                  outputs=outputs)
        # Validate that the results differ from the precomputed (correct) values.
        output0_np = results.as_numpy('OUTPUT0')
        output1_np = results.as_numpy('OUTPUT1')
        self.assertFalse(np.array_equal(output0_np, expected_output0_np),
                         "expected OUTPUT0 to differ from the correct values")
        self.assertFalse(np.array_equal(output1_np, expected_output1_np),
                         "expected OUTPUT1 to differ from the correct values")
    def test_multi_batch_not_preferred(self):
        # Send two requests with total static batch size in between
        # preferred sizes. This should cause the first response to be
        # delayed by the max batch queue delay, and the second by max
        # delay (minus the difference in time that they arrived in the
        # queue)
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)
                self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1,
                                           (_max_queue_delay_ms * 1.5,
                                            _max_queue_delay_ms))))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 3,
                                           (_max_queue_delay_ms * 1.5,
                                            _max_queue_delay_ms - 2000))))
                threads[0].start()
                time.sleep(1)
                threads[1].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (1, 3), 1, 4)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))

    # Outputs are the sum and difference of the inputs, so cap the input
    # values so that the outputs cannot overflow. This allows an exact
    # match on the results. For float types use the int8, int16, and
    # int32 ranges for fp16, fp32, and fp64 respectively. When getting
    # class outputs the result value/probability is returned as a float,
    # so the fp32 range must be used in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_list = list()
    input1_list = list()
    expected0_list = list()
    expected1_list = list()
    expected0_val_list = list()
    expected1_val_list = list()
    for b in range(batch_size):
        in0 = np.random.randint(low=val_min,
                                high=val_max,
                                size=tensor_shape,
                                dtype=rinput_dtype)
        in1 = np.random.randint(low=val_min,
                                high=val_max,
                                size=tensor_shape,
                                dtype=rinput_dtype)
        if input_dtype != np.object:
            in0 = in0.astype(input_dtype)
            in1 = in1.astype(input_dtype)

        if not swap:
            op0 = in0 + in1
            op1 = in0 - in1
        else:
            op0 = in0 - in1
            op1 = in0 + in1

        expected0_val_list.append(op0)
        expected1_val_list.append(op1)
        if output0_dtype == np.object:
            expected0_list.append(
                np.array([
                    unicode(str(x), encoding='utf-8') for x in (op0.flatten())
                ],
                         dtype=object).reshape(op0.shape))
        else:
            expected0_list.append(op0.astype(output0_dtype))
        if output1_dtype == np.object:
            expected1_list.append(
                np.array([
                    unicode(str(x), encoding='utf-8') for x in (op1.flatten())
                ],
                         dtype=object).reshape(op1.shape))
        else:
            expected1_list.append(op1.astype(output1_dtype))

        if input_dtype == np.object:
            in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
            in0 = in0n.reshape(in0.shape)
            in1n = np.array([str(x) for x in in1.reshape(in1.size)],
                            dtype=object)
            in1 = in1n.reshape(in1.shape)

        input0_list.append(in0)
        input1_list.append(in1)

    # prepend the size of each string to the string input data
    if input_dtype == np.object:
        input0_list_tmp = _prepend_string_size(input0_list)
        input1_list_tmp = _prepend_string_size(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list])

    if output0_dtype == np.object:
        expected0_list_tmp = _prepend_string_size(expected0_list)
    else:
        expected0_list_tmp = expected0_list

    if output1_dtype == np.object:
        expected1_list_tmp = _prepend_string_size(expected1_list)
    else:
        expected1_list_tmp = expected1_list

    # Create and register system/cuda shared memory regions if needed
    shm_handles = su.create_register_set_shm_regions(
        input0_list_tmp, input1_list_tmp, expected0_list_tmp,
        expected1_list_tmp, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=correlation_id,
                           streaming=config[2],
                           verbose=True)

        expected0_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected0_val_list
        ]
        expected1_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected1_val_list
        ]

        output_req = {}
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"
        if pf == "libtorch" or pf == "libtorch_nobatch":
            OUTPUT0 = "OUTPUT__0"
            OUTPUT1 = "OUTPUT__1"
            INPUT0 = "INPUT__0"
            INPUT1 = "INPUT__1"
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_handles) != 0:
                output_req[OUTPUT0] = (InferContext.ResultFormat.RAW,
                                       shm_handles[2])
            else:
                if output0_raw:
                    output_req[OUTPUT0] = InferContext.ResultFormat.RAW
                else:
                    output_req[OUTPUT0] = (InferContext.ResultFormat.CLASS,
                                           num_classes)
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_handles) != 0:
                output_req[OUTPUT1] = (InferContext.ResultFormat.RAW,
                                       shm_handles[2 + i])
            else:
                if output1_raw:
                    output_req[OUTPUT1] = InferContext.ResultFormat.RAW
                else:
                    output_req[OUTPUT1] = (InferContext.ResultFormat.CLASS,
                                           num_classes)

        if len(shm_handles) != 0:
            results = ctx.run(
                {
                    INPUT0: (shm_handles[0], tensor_shape),
                    INPUT1: (shm_handles[1], tensor_shape)
                }, output_req, batch_size)
        else:
            results = ctx.run({
                INPUT0: input0_list,
                INPUT1: input1_list
            }, output_req, batch_size)

        if not skip_request_id_check:
            global _seen_request_ids
            request_id = ctx.get_last_request_id()
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), len(outputs))
        for (result_name, result_val) in iteritems(results):
            for b in range(batch_size):
                if ((result_name == OUTPUT0 and output0_raw)
                        or (result_name == OUTPUT1 and output1_raw)):
                    if result_name == OUTPUT0:
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected0_list[b]),
                            "{}, {} expected: {}, got {}".format(
                                model_name, OUTPUT0, expected0_list[b],
                                result_val[b]))
                    elif result_name == OUTPUT1:
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected1_list[b]),
                            "{}, {} expected: {}, got {}".format(
                                model_name, OUTPUT1, expected1_list[b],
                                result_val[b]))
                    else:
                        tester.assertTrue(
                            False,
                            "unexpected raw result {}".format(result_name))
                else:
                    # num_classes values must be returned and must
                    # match expected top values
                    class_list = result_val[b]
                    tester.assertEqual(len(class_list), num_classes)

                    expected0_flatten = expected0_list[b].flatten()
                    expected1_flatten = expected1_list[b].flatten()

                    for idx, ctuple in enumerate(class_list):
                        if result_name == OUTPUT0:
                            # can't compare indices since could have
                            # different indices with the same
                            # value/prob, so compare that the value of
                            # each index equals the expected
                            # value. Can only compare labels when the
                            # indices are equal.
                            tester.assertEqual(ctuple[1],
                                               expected0_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if ctuple[0] == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(ctuple[1],
                                               expected1_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_handles, precreated_shm_regions,
                                      outputs, use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
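
# Illustration only: a typical call from a test case looks like the existing
# uses in this file, for example
#
#     iu.infer_exact(self, 'graphdef', (16,), 1,
#                    np.int32, np.int8, np.int8,
#                    output0_raw=False, swap=False)
#
# When output0_raw/output1_raw is False the corresponding result is returned
# as (class_index, value, label) tuples, which is what the comparison loop
# above unpacks as ctuple[0], ctuple[1], and ctuple[2].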
    def test_dynamic_model_modify(self):
        input_size = 16
        models_base = ('savedmodel', 'plan')
        models_shape = ((input_size,), (input_size, 1, 1))
        models = list()
        for m in models_base:
            models.append(tu.get_model_name(m, np.float32, np.float32, np.float32))

        # Make sure savedmodel and plan are in the status
        for model_name in models:
            try:
                for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                    ctx = ServerStatusContext(pair[0], pair[1], model_name, True)
                    ss = ctx.get_server_status()
                    self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                    self.assertEqual("inference:0", ss.id)
                    self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                    self.assertEqual(len(ss.model_status), 1)
                    self.assertTrue(model_name in ss.model_status,
                                    "expected status for model " + model_name)
                    for (k, v) in iteritems(ss.model_status[model_name].version_status):
                        self.assertEqual(v.ready_state, server_status.MODEL_READY)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))

        # Run inference on the model, both versions 1 and 3
        for version in (1, 3):
            for model_name, model_shape in zip(models_base, models_shape):
                try:
                    iu.infer_exact(self, model_name, model_shape, 1,
                                   np.float32, np.float32, np.float32, swap=(version == 3),
                                   model_version=version)
                except InferenceServerException as ex:
                    self.assertTrue(False, "unexpected error {}".format(ex))

        # Change the model configuration to use the default version
        # policy (so that only version 3 is available).
        for base_name, model_name in zip(models_base, models):
            shutil.copyfile("config.pbtxt." + base_name, "models/" + model_name + "/config.pbtxt")

        time.sleep(5) # wait for models to reload
        for model_name in models:
            try:
                for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                    ctx = ServerStatusContext(pair[0], pair[1], model_name, True)
                    ss = ctx.get_server_status()
                    self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                    self.assertEqual("inference:0", ss.id)
                    self.assertEqual(server_status.SERVER_READY, ss.ready_state)
                    self.assertEqual(len(ss.model_status), 1)
                    self.assertTrue(model_name in ss.model_status,
                                    "expected status for model " + model_name)
                    self.assertTrue(1 in ss.model_status[model_name].version_status,
                                    "expected status for version 1 of model " + model_name)
                    self.assertTrue(3 in ss.model_status[model_name].version_status,
                                    "expected status for version 3 of model " + model_name)
                    self.assertEqual(ss.model_status[model_name].version_status[1].ready_state,
                                     server_status.MODEL_UNAVAILABLE)
                    self.assertEqual(ss.model_status[model_name].version_status[3].ready_state,
                                     server_status.MODEL_READY)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))

        # Attempt inferencing using version 1, should fail since
        # change in model policy makes that no longer available.
        for model_name, model_shape in zip(models_base, models_shape):
            try:
                iu.infer_exact(self, model_name, model_shape, 1,
                               np.float32, np.float32, np.float32, swap=False,
                               model_version=1)
                self.assertTrue(False, "expected error for unavailable model " + model_name)
            except InferenceServerException as ex:
                self.assertEqual("inference:0", ex.server_id())
                self.assertTrue(
                    ex.message().startswith("Inference request for unknown model"))

        # Version 3 should continue to work...
        for model_name, model_shape in zip(models_base, models_shape):
            try:
                iu.infer_exact(self, model_name, model_shape, 1,
                               np.float32, np.float32, np.float32, swap=True,
                               model_version=3)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
    def test_dynamic_version_load_unload_disabled(self):
        input_size = 16
        tensor_shape = (input_size,)
        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32, np.int32)

        # Add a new version to the model store and give it time to
        # load. But it shouldn't load because dynamic loading is
        # disabled.
        try:
            shutil.copytree("models/" + graphdef_name + "/2",
                            "models/" + graphdef_name + "/7")
            time.sleep(5) # wait for model to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertFalse(7 in ss.model_status[graphdef_name].version_status,
                                "unexpected status for version 7 of model " + graphdef_name)
                self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Remove one of the original versions from the model
        # store. Unloading is disabled so it should remain available
        # in the status.
        try:
            shutil.rmtree("models/" + graphdef_name + "/1")
            time.sleep(5) # wait for version to unload (but it shouldn't)
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertTrue(1 in ss.model_status[graphdef_name].version_status,
                                "expected status for version 1 of model " + graphdef_name)

                self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3)
                for (k, v) in iteritems(ss.model_status[graphdef_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Run inference to make sure model still being served even
        # though version deleted from model store
        try:
            iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                           np.int32, np.int32, np.int32, swap=False,
                           model_version=1)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def test_dynamic_version_load_unload(self):
        input_size = 16
        tensor_shape = (input_size,)
        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32, np.int32)

        # There are 3 versions. Make sure that all have status and are
        # ready.
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3)
                for (k, v) in iteritems(ss.model_status[graphdef_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Run inference on version 1 to make sure it is available
        try:
            iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                           np.int32, np.int32, np.int32, swap=False,
                           model_version=1)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Make sure version 1 has execution stats in the status.
        expected_exec_cnt = 0
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertTrue(1 in ss.model_status[graphdef_name].version_status,
                                "expected status for version 1 of model " + graphdef_name)

                version_status = ss.model_status[graphdef_name].version_status[1]
                self.assertEqual(version_status.ready_state, server_status.MODEL_READY)
                self.assertGreater(version_status.model_execution_count, 0)
                expected_exec_cnt = version_status.model_execution_count
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Remove version 1 from the model store and give it time to
        # unload. Make sure that it has a status but is unavailable.
        try:
            shutil.rmtree("models/" + graphdef_name + "/1")
            time.sleep(5) # wait for version to unload
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertTrue(1 in ss.model_status[graphdef_name].version_status,
                                "expected status for version 1 of model " + graphdef_name)

                version_status = ss.model_status[graphdef_name].version_status[1]
                self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE)
                self.assertEqual(version_status.model_execution_count, expected_exec_cnt)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Version is removed so inference should fail
        try:
            iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                           np.int32, np.int32, np.int32, swap=False,
                           model_version=1)
            self.assertTrue(False, "expected error for unavailable model " + graphdef_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(
                ex.message().startswith(
                    "Inference request for unknown model 'graphdef_int32_int32_int32'"))

        # Add back the same version. The status/stats should be
        # retained for the version (note that this is different behavior
        # than if a model is removed and then added back).
        try:
            shutil.copytree("models/" + graphdef_name + "/2",
                            "models/" + graphdef_name + "/1")
            time.sleep(5) # wait for model to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3)
                for (k, v) in iteritems(ss.model_status[graphdef_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
                    if k == 1:
                        self.assertEqual(v.model_execution_count, expected_exec_cnt)
                    else:
                        self.assertEqual(v.model_execution_count, 0)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Add another version from the model store.
        try:
            shutil.copytree("models/" + graphdef_name + "/2",
                            "models/" + graphdef_name + "/7")
            time.sleep(5) # wait for version to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(graphdef_name in ss.model_status,
                                "expected status for model " + graphdef_name)
                self.assertTrue(7 in ss.model_status[graphdef_name].version_status,
                                "expected status for version 7 of model " + graphdef_name)

                self.assertEqual(len(ss.model_status[graphdef_name].version_status), 4)
                for (k, v) in iteritems(ss.model_status[graphdef_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def test_dynamic_model_load_unload_disabled(self):
        input_size = 16
        tensor_shape = (input_size,)
        savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32)
        netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32)

        # Make sure savedmodel model is not in the status (because
        # initially it is not in the model store)
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertTrue(False, "expected status failure for " + savedmodel_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertGreater(ex.request_id(), 0)
            self.assertTrue(
                ex.message().startswith("no status available for unknown model"))

        # Add savedmodel model to the model store and give it time to
        # load. But it shouldn't load because dynamic loading is disabled.
        try:
            shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
            time.sleep(5) # wait for model to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertTrue(False, "expected status failure for " + savedmodel_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertGreater(ex.request_id(), 0)
            self.assertTrue(
                ex.message().startswith("no status available for unknown model"))

        # Run inference which should fail because the model isn't there
        try:
            iu.infer_exact(self, 'savedmodel', tensor_shape, 1,
                           np.float32, np.float32, np.float32, swap=True)
            self.assertTrue(False, "expected error for unavailable model " + savedmodel_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertGreater(ex.request_id(), 0)
            self.assertTrue(
                ex.message().startswith("no status available for unknown model"))

        # Remove one of the original models from the model
        # store. Unloading is disabled so it should remain available
        # in the status.
        try:
            shutil.rmtree("models/" + netdef_name)
            time.sleep(5) # wait for model to unload (but it shouldn't)
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(netdef_name in ss.model_status,
                                "expected status for model " + netdef_name)
                self.assertTrue(3 in ss.model_status[netdef_name].version_status,
                                "expected status for version 3 of model " + netdef_name)

                version_status = ss.model_status[netdef_name].version_status[3]
                self.assertEqual(version_status.ready_state, server_status.MODEL_READY)

        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Run inference to make sure model still being served even
        # though deleted from model store
        try:
            iu.infer_exact(self, 'netdef', tensor_shape, 1,
                           np.float32, np.float32, np.float32, swap=True)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def test_dynamic_model_load_unload(self):
        input_size = 16
        tensor_shape = (input_size,)
        savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32)
        netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32)

        # Make sure savedmodel model is not in the status (because
        # initially it is not in the model store)
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertTrue(False, "expected status failure for " + savedmodel_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(
                ex.message().startswith("no status available for unknown model"))

        # Add savedmodel model to the model store and give it time to
        # load. Make sure that it has a status and is ready.
        try:
            shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
            time.sleep(5) # wait for model to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(savedmodel_name in ss.model_status,
                                "expected status for model " + savedmodel_name)
                for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Run inference on the just loaded model
        try:
            iu.infer_exact(self, 'savedmodel', tensor_shape, 1,
                           np.float32, np.float32, np.float32, swap=True)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Make sure savedmodel has execution stats in the status.
        expected_exec_cnt = 0
        try:
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(savedmodel_name in ss.model_status,
                                "expected status for model " + savedmodel_name)
                self.assertTrue(3 in ss.model_status[savedmodel_name].version_status,
                                "expected status for version 3 of model " + savedmodel_name)

                version_status = ss.model_status[savedmodel_name].version_status[3]
                self.assertEqual(version_status.ready_state, server_status.MODEL_READY)
                self.assertGreater(version_status.model_execution_count, 0)
                expected_exec_cnt = version_status.model_execution_count
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Remove savedmodel model from the model store and give it
        # time to unload. Make sure that it has a status but is
        # unavailable.
        try:
            shutil.rmtree("models/" + savedmodel_name)
            time.sleep(5) # wait for model to unload
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(savedmodel_name in ss.model_status,
                                "expected status for model " + savedmodel_name)
                self.assertTrue(3 in ss.model_status[savedmodel_name].version_status,
                                "expected status for version 3 of model " + savedmodel_name)

                version_status = ss.model_status[savedmodel_name].version_status[3]
                self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE)
                self.assertEqual(version_status.model_execution_count, expected_exec_cnt)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Model is removed so inference should fail
        try:
            iu.infer_exact(self, 'savedmodel', tensor_shape, 1,
                           np.float32, np.float32, np.float32, swap=True)
            self.assertTrue(False, "expected error for unavailable model " + savedmodel_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(
                ex.message().startswith(
                    "Inference request for unknown model 'savedmodel_float32_float32_float32'"))

        # Add back the same model. The status/stats should be reset.
        try:
            shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
            time.sleep(5) # wait for model to load
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(savedmodel_name in ss.model_status,
                                "expected status for model " + savedmodel_name)
                for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status):
                    self.assertEqual(v.ready_state, server_status.MODEL_READY)
                    self.assertEqual(v.model_execution_count, 0)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Remove the original netdef model from the model store and give
        # it time to unload. Make sure that it has a status but is
        # unavailable.
        try:
            shutil.rmtree("models/" + netdef_name)
            time.sleep(5) # wait for model to unload
            for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]:
                ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True)
                ss = ctx.get_server_status()
                self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version)
                self.assertEqual("inference:0", ss.id)
                self.assertEqual(server_status.SERVER_READY, ss.ready_state)

                self.assertEqual(len(ss.model_status), 1)
                self.assertTrue(netdef_name in ss.model_status,
                                "expected status for model " + netdef_name)
                self.assertTrue(3 in ss.model_status[netdef_name].version_status,
                                "expected status for version 3 of model " + netdef_name)

                version_status = ss.model_status[netdef_name].version_status[3]
                self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))

        # Model is removed so inference should fail
        try:
            iu.infer_exact(self, 'netdef', tensor_shape, 1,
                           np.float32, np.float32, np.float32, swap=True)
            self.assertTrue(False, "expected error for unavailable model " + netdef_name)
        except InferenceServerException as ex:
            self.assertEqual("inference:0", ex.server_id())
            self.assertTrue(
                ex.message().startswith(
                    "Inference request for unknown model 'netdef_float32_float32_float32'"))
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                req_raw,
                input_dtype,
                output0_dtype,
                output1_dtype,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                skip_request_id_check=False,
                send_input_shape=False):
    tester.assertTrue(use_http or use_grpc)
    protocols = []
    if use_http:
        protocols.append(("localhost:8000", ProtocolType.HTTP))
    if use_grpc:
        protocols.append(("localhost:8001", ProtocolType.GRPC))

    for pair in protocols:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        # Outputs are the sum and difference of the inputs, so cap the input
        # values so that the outputs cannot overflow. This allows an exact
        # match on the results. For floating point types use the int8, int16
        # and int32 ranges for fp16, fp32 and fp64 respectively. When
        # requesting class outputs the value/probability is returned as a
        # float, so the fp32 range must be used in that case.
        rinput_dtype = _range_repr_dtype(input_dtype)
        routput0_dtype = _range_repr_dtype(
            output0_dtype if req_raw else np.float32)
        routput1_dtype = _range_repr_dtype(
            output1_dtype if req_raw else np.float32)
        val_min = max(
            np.iinfo(rinput_dtype).min,
            np.iinfo(routput0_dtype).min,
            np.iinfo(routput1_dtype).min) / 2
        val_max = min(
            np.iinfo(rinput_dtype).max,
            np.iinfo(routput0_dtype).max,
            np.iinfo(routput1_dtype).max) / 2

        num_classes = 3

        input0_list = list()
        input1_list = list()
        expected0_list = list()
        expected1_list = list()
        expected0_val_list = list()
        expected1_val_list = list()
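        # For each batch element generate random inputs and compute the
        # expected sum/difference (or swapped) outputs.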
        for b in range(batch_size):
            in0 = np.random.randint(low=val_min,
                                    high=val_max,
                                    size=tensor_shape,
                                    dtype=rinput_dtype)
            in1 = np.random.randint(low=val_min,
                                    high=val_max,
                                    size=tensor_shape,
                                    dtype=rinput_dtype)
            if input_dtype != np.object:
                in0 = in0.astype(input_dtype)
                in1 = in1.astype(input_dtype)

            if not swap:
                op0 = in0 + in1
                op1 = in0 - in1
            else:
                op0 = in0 - in1
                op1 = in0 + in1

            expected0_val_list.append(op0)
            expected1_val_list.append(op1)
            if output0_dtype == np.object:
                expected0_list.append(
                    np.array([
                        bytes(str(x), encoding='utf-8')
                        for x in (op0.flatten())
                    ],
                             dtype=object).reshape(op0.shape))
            else:
                expected0_list.append(op0)
            if output1_dtype == np.object:
                expected1_list.append(
                    np.array([
                        bytes(str(x), encoding='utf-8')
                        for x in (op1.flatten())
                    ],
                             dtype=object).reshape(op1.shape))
            else:
                expected1_list.append(op1)

            if input_dtype == np.object:
                in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                dtype=object)
                in0 = in0n.reshape(in0.shape)
                in1n = np.array([str(x) for x in in1.reshape(in1.size)],
                                dtype=object)
                in1 = in1n.reshape(in1.shape)

            input0_list.append(in0)
            input1_list.append(in1)

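        # Indices of the expected values sorted in descending order, used to
        # validate CLASS-format results.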
        expected0_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected0_val_list
        ]
        expected1_sort_idx = [
            np.flip(np.argsort(x.flatten()), 0) for x in expected1_val_list
        ]

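        # Request either raw tensors or the top num_classes classification
        # results for each requested output.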
        output_req = {}
        for o in outputs:
            if req_raw:
                output_req[o] = InferContext.ResultFormat.RAW
            else:
                output_req[o] = (InferContext.ResultFormat.CLASS, num_classes)

        ctx = InferContext(pair[0], pair[1], model_name, model_version, True)
        results = ctx.run({
            "INPUT0": input0_list,
            "INPUT1": input1_list
        }, output_req, batch_size, {
            "INPUT0": tensor_shape,
            "INPUT1": tensor_shape
        } if (send_input_shape) else None)

        if not skip_request_id_check:
            global _last_request_id
            min_request_id = _last_request_id + 1
            request_id = ctx.get_last_request_id()
            _last_request_id = request_id
            tester.assertGreaterEqual(request_id, min_request_id)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), len(outputs))
        for (result_name, result_val) in iteritems(results):
            for b in range(batch_size):
                if req_raw:
                    if result_name == "OUTPUT0":
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected0_list[b]),
                            "{}, OUTPUT0 expected: {}, got {}".format(
                                model_name, expected0_list[b], result_val[b]))
                    elif result_name == "OUTPUT1":
                        tester.assertTrue(
                            np.array_equal(result_val[b], expected1_list[b]),
                            "{}, OUTPUT1 expected: {}, got {}".format(
                                model_name, expected1_list[b], result_val[b]))
                    else:
                        tester.assertTrue(
                            False,
                            "unexpected raw result {}".format(result_name))
                else:
                    # num_classes values must be returned and must
                    # match expected top values
                    class_list = result_val[b]
                    tester.assertEqual(len(class_list), num_classes)

                    expected0_flatten = expected0_list[b].flatten()
                    expected1_flatten = expected1_list[b].flatten()

                    for idx, ctuple in enumerate(class_list):
                        if result_name == "OUTPUT0":
                            # can't compare indices since could have
                            # different indices with the same
                            # value/prob, so compare that the value of
                            # each index equals the expected
                            # value. Can only compare labels when the
                            # indices are equal.
                            tester.assertEqual(ctuple[1],
                                               expected0_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if ctuple[0] == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == "OUTPUT1":
                            tester.assertEqual(ctuple[1],
                                               expected1_flatten[ctuple[0]])
                            tester.assertEqual(
                                ctuple[1],
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))
    return results
def create_ensemble_modelconfig(
        base_model, models_dir, max_batch, model_version,
        input_shape, output0_shape, output1_shape,
        input_dtype, output0_dtype, output1_dtype,
        output0_label_cnt, version_policy):

    # No validation as long as the base model supports the type and shape

    # Unpack version policy
    version_policy_str = "{ latest { num_versions: 1 }}"
    if version_policy is not None:
        type, val = version_policy
        if type == 'latest':
            version_policy_str = "{{ latest {{ num_versions: {} }}}}".format(val)
        elif type == 'specific':
            version_policy_str = "{{ specific {{ versions: {} }}}}".format(val)
        else:
            version_policy_str = "{ all { }}"

    input_model_dtype = np_to_model_dtype(input_dtype)
    output0_model_dtype = np_to_model_dtype(output0_dtype)
    output1_model_dtype = np_to_model_dtype(output1_dtype)

    for ensemble_type in BASIC_ENSEMBLE_TYPES:
        # Use a different model name for the non-batching variant
        ensemble_model_name = "{}_{}{}".format(ensemble_type, base_model, "_nobatch" if max_batch == 0 else "")
        model_name = tu.get_model_name(ensemble_model_name,
                                    input_dtype, output0_dtype, output1_dtype)
        base_model_name = tu.get_model_name("{}{}".format(base_model, "_nobatch" if max_batch == 0 else ""),
                                    input_dtype, output0_dtype, output1_dtype)

        ensemble_schedule = EnsembleSchedule(ensemble_type).get_schedule(
                        base_model_name, input_shape, output0_shape,
                        output1_shape, input_model_dtype,
                        output0_model_dtype, output1_model_dtype)

        config_dir = models_dir + "/" + model_name
        config = '''
name: "{}"
platform: "ensemble"
max_batch_size: {}
version_policy: {}
input [
  {{
    name: "INPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "INPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
output [
  {{
    name: "OUTPUT0"
    data_type: {}
    dims: [ {} ]
    label_filename: "output0_labels.txt"
  }},
  {{
    name: "OUTPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
{}
'''.format(model_name, max_batch, version_policy_str,
            input_model_dtype, tu.shape_to_dims_str(input_shape),
            input_model_dtype, tu.shape_to_dims_str(input_shape),
            output0_model_dtype, tu.shape_to_dims_str(output0_shape),
            output1_model_dtype, tu.shape_to_dims_str(output1_shape),
            ensemble_schedule)

        try:
            os.makedirs(config_dir)
        except OSError as ex:
            pass # ignore existing dir

        with open(config_dir + "/config.pbtxt", "w") as cfile:
            cfile.write(config)

        with open(config_dir + "/output0_labels.txt", "w") as lfile:
            for l in range(output0_label_cnt):
                lfile.write("label" + str(l) + "\n")
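
# A minimal usage sketch (not part of the original example). The base model
# name, shapes, dtypes and version policy below are illustrative assumptions,
# and it presumes numpy and the surrounding helper utilities (tu,
# EnsembleSchedule, BASIC_ENSEMBLE_TYPES, np_to_model_dtype) are available at
# module scope.
if __name__ == '__main__':
    create_ensemble_modelconfig(
        'savedmodel', 'models', 8, 1,
        (16,), (16,), (16,),
        np.float32, np.float32, np.float32,
        output0_label_cnt=16, version_policy=('latest', 1))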
Example 20
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
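    # Run add/sub inference on the model selected by 'pf' and the I/O
    # datatypes, then verify the results exactly for every enabled
    # protocol/config (HTTP, GRPC, streaming).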
    tester.assertTrue(use_http or use_grpc or use_streaming)
    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
               (output0_dtype != np.float16) and (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # Outputs are the sum and difference of the inputs, so cap the input
    # values so that the outputs cannot overflow. This allows an exact
    # match on the results. For floating point types use the int8, int16
    # and int32 ranges for fp16, fp32 and fp64 respectively. When
    # requesting class outputs the value/probability is returned as a
    # float, so the fp32 range must be used in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output0_array.flatten())
        ],
                                 dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output1_array.flatten())
        ],
                                 dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

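    # libtorch models name tensors positionally (INPUT__0, OUTPUT__0);
    # other backends use the INPUT0/OUTPUT0 naming.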
    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size,
        outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

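        # Expected class ordering (indices sorted by descending value) used
        # when checking CLASS-format results.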
        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1, ) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1, ) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT0,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT1,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

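        # config[2] selects streaming: use the async streaming API and
        # collect the result via the completion callback; otherwise issue a
        # blocking infer request.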
        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw)
                    or (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                             for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
Example 21
    def test_load_wrong_optimization_profile(self):
        client = tritonhttpclient.InferenceServerClient("localhost:8000")
        model_name = tu.get_model_name(self.model_name_, self.dtype_,
                                       self.dtype_, self.dtype_)
        model_status = client.is_model_ready(model_name, "1")
        self.assertFalse(model_status, "expected model to be not ready")
def create_plan_fixed_modelfile(models_dir, max_batch, model_version,
                                input_shape, output0_shape, output1_shape,
                                input_dtype, output0_dtype, output1_dtype,
                                input_memory_format, output_memory_format):
    trt_input_dtype = np_to_trt_dtype(input_dtype)
    trt_output0_dtype = np_to_trt_dtype(output0_dtype)
    trt_output1_dtype = np_to_trt_dtype(output1_dtype)
    trt_input_memory_format = input_memory_format
    trt_output_memory_format = output_memory_format

    # Create the model
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    in0 = network.add_input("INPUT0", trt_input_dtype, input_shape)
    in1 = network.add_input("INPUT1", trt_input_dtype, input_shape)
    add = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUM)
    sub = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUB)

    out0 = network.add_identity(add.get_output(0))
    out1 = network.add_identity(sub.get_output(0))

    out0.get_output(0).name = "OUTPUT0"
    out1.get_output(0).name = "OUTPUT1"
    network.mark_output(out0.get_output(0))
    network.mark_output(out1.get_output(0))

    out0.get_output(0).dtype = trt_output0_dtype
    out1.get_output(0).dtype = trt_output1_dtype

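    # Restrict the allowed I/O tensor formats to the requested memory format
    # (allowed_formats is a bitmask over trt.TensorFormat values).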
    in0.allowed_formats = 1 << int(trt_input_memory_format)
    in1.allowed_formats = 1 << int(trt_input_memory_format)
    out0.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)
    out1.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)

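    # INT8 tensors need an explicit dynamic range since no calibrator is used.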
    if (trt_input_dtype == trt.int8):
        in0.dynamic_range = (-128.0, 127.0)
        in1.dynamic_range = (-128.0, 127.0)
    if (trt_output0_dtype == trt.int8):
        out0.get_output(0).dynamic_range = (-128.0, 127.0)
    if (trt_output1_dtype == trt.int8):
        out1.get_output(0).dynamic_range = (-128.0, 127.0)

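    # Build with strict type constraints and enable INT8/FP16 modes as
    # required by the tensor datatypes.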
    flags = 1 << int(trt.BuilderFlag.STRICT_TYPES)
    datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
    for dt in datatype_set:
        if (dt == trt.int8):
            flags |= 1 << int(trt.BuilderFlag.INT8)
        elif (dt == trt.float16):
            flags |= 1 << int(trt.BuilderFlag.FP16)
    config = builder.create_builder_config()
    config.flags = flags
    config.max_workspace_size = 1 << 20
    builder.max_batch_size = max(1, max_batch)
    engine = builder.build_engine(network, config)

    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)
    model_version_dir = models_dir + "/" + model_name + "/" + str(
        model_version)

    try:
        os.makedirs(model_version_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(model_version_dir + "/model.plan", "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder
Example 23
    def test_multi_batch_delayed_preferred_different_shape(self):
        # Send two requests with total static batch size in between
        # preferred sizes. Then send a request with a different shape
        # and a non-preferred batch size. Use
        # TRTSERVER_DELAY_SCHEDULER in the environment so that
        # requests can be queued up before scheduler starts
        # servicing. This should cause the first two requests to be
        # immediately responded to. Send a fourth request with the same
        # shape as the third so that a preferred batch size is reached and
        # the third and fourth responses are sent immediately.
        for trial in _trials:
            try:
                url = "localhost:8000"
                protocol = ProtocolType.HTTP
                model_name = tu.get_model_name(trial, np.float32, np.float32,
                                               np.float32)

                self.check_setup(url, protocol, model_name)

                # Need scheduler to wait for queue to contain 4 requests
                self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
                self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]),
                                 4)

                threads = []
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1, (3000, None)),
                                     kwargs={
                                         'shm_region_names':
                                         ['ip00', 'ip01', 'op00', 'op01']
                                     }))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 3, (3000, None)),
                                     kwargs={
                                         'shm_region_names':
                                         ['ip10', 'ip11', 'op10', 'op11']
                                     }))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1, (3000, None)),
                                     kwargs={
                                         'input_size':
                                         8,
                                         'shm_region_names':
                                         ['ip20', 'ip21', 'op20', 'op21']
                                     }))
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 5, (3000, None)),
                                     kwargs={
                                         'input_size':
                                         8,
                                         'shm_region_names':
                                         ['ip30', 'ip31', 'op30', 'op31']
                                     }))
                threads[0].start()
                threads[1].start()
                time.sleep(1)
                threads[2].start()
                threads[3].start()
                for t in threads:
                    t.join()
                self.check_deferred_exception()
                self.check_status(url, protocol, model_name, (1, 3, 5), 2, 10)
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
def create_plan_modelconfig(models_dir, max_batch, model_version, input_shape,
                            output0_shape, output1_shape, input_dtype,
                            output0_dtype, output1_dtype, input_memory_format,
                            output_memory_format, version_policy):

    if not tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype,
                                     input_shape, output0_shape,
                                     output1_shape):
        return

    # Unpack version policy
    version_policy_str = "{ latest { num_versions: 1 }}"
    if version_policy is not None:
        type, val = version_policy
        if type == 'latest':
            version_policy_str = "{{ latest {{ num_versions: {} }}}}".format(
                val)
        elif type == 'specific':
            version_policy_str = "{{ specific {{ versions: {} }}}}".format(val)
        else:
            version_policy_str = "{ all { }}"

    # Use a different model name for different kinds of models
    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)

    config_dir = models_dir + "/" + model_name
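    # For variable-size dims the generated config references optimization
    # profile 0 in its instance group; fixed-shape configs omit it.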
    if -1 in input_shape:
        profile_index = 0
        config = '''
name: "{}"
platform: "tensorrt_plan"
max_batch_size: {}
version_policy: {}
input [
  {{
    name: "INPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "INPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
output [
  {{
    name: "OUTPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "OUTPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
instance_group [
  {{
      profile:"{}"
  }}
]
'''.format(model_name, max_batch, version_policy_str,
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(output0_dtype),
           tu.shape_to_dims_str(output0_shape),
           np_to_model_dtype(output1_dtype),
           tu.shape_to_dims_str(output1_shape), profile_index)
    else:
        config = '''
name: "{}"
platform: "tensorrt_plan"
max_batch_size: {}
version_policy: {}
input [
  {{
    name: "INPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "INPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
output [
  {{
    name: "OUTPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "OUTPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
'''.format(model_name, max_batch, version_policy_str,
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(output0_dtype),
           tu.shape_to_dims_str(output0_shape),
           np_to_model_dtype(output1_dtype),
           tu.shape_to_dims_str(output1_shape))

    try:
        os.makedirs(config_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(config_dir + "/config.pbtxt", "w") as cfile:
        cfile.write(config)
def create_plan_dynamic_modelfile(models_dir,
                                  max_batch,
                                  model_version,
                                  input_shape,
                                  output0_shape,
                                  output1_shape,
                                  input_dtype,
                                  output0_dtype,
                                  output1_dtype,
                                  input_memory_format,
                                  output_memory_format,
                                  min_dim=1,
                                  max_dim=64):
    trt_input_dtype = np_to_trt_dtype(input_dtype)
    trt_output0_dtype = np_to_trt_dtype(output0_dtype)
    trt_output1_dtype = np_to_trt_dtype(output1_dtype)
    trt_input_memory_format = input_memory_format
    trt_output_memory_format = output_memory_format

    # Create the model
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
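    # Dynamic shapes require an explicit-batch network definition.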
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    if max_batch == 0:
        input_with_batchsize = [i for i in input_shape]
    else:
        input_with_batchsize = [-1] + [i for i in input_shape]

    in0 = network.add_input("INPUT0", trt_input_dtype, input_with_batchsize)
    in1 = network.add_input("INPUT1", trt_input_dtype, input_with_batchsize)
    add = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUM)
    sub = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUB)

    out0 = network.add_identity(add.get_output(0))
    out1 = network.add_identity(sub.get_output(0))

    out0.get_output(0).name = "OUTPUT0"
    out1.get_output(0).name = "OUTPUT1"
    network.mark_output(out0.get_output(0))
    network.mark_output(out1.get_output(0))

    out0.get_output(0).dtype = trt_output0_dtype
    out1.get_output(0).dtype = trt_output1_dtype

    in0.allowed_formats = 1 << int(trt_input_memory_format)
    in1.allowed_formats = 1 << int(trt_input_memory_format)
    out0.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)
    out1.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)

    if (trt_input_dtype == trt.int8):
        in0.dynamic_range = (-128.0, 127.0)
        in1.dynamic_range = (-128.0, 127.0)
    if (trt_output0_dtype == trt.int8):
        out0.get_output(0).dynamic_range = (-128.0, 127.0)
    if (trt_output1_dtype == trt.int8):
        out1.get_output(0).dynamic_range = (-128.0, 127.0)

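    # Build the min/opt/max shapes for the optimization profile; wildcard
    # (-1) dims span [min_dim, max_dim].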
    min_shape = []
    opt_shape = []
    max_shape = []
    if max_batch != 0:
        min_shape = min_shape + [1]
        opt_shape = opt_shape + [max(1, max_batch)]
        max_shape = max_shape + [max(1, max_batch)]
    for i in input_shape:
        if i == -1:
            min_shape = min_shape + [min_dim]
            opt_shape = opt_shape + [int((max_dim + min_dim) / 2)]
            max_shape = max_shape + [max_dim]
        else:
            min_shape = min_shape + [i]
            opt_shape = opt_shape + [i]
            max_shape = max_shape + [i]

    profile = builder.create_optimization_profile()
    profile.set_shape("INPUT0", min_shape, opt_shape, max_shape)
    profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)
    flags = 1 << int(trt.BuilderFlag.STRICT_TYPES)
    datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
    for dt in datatype_set:
        if (dt == trt.int8):
            flags |= 1 << int(trt.BuilderFlag.INT8)
        elif (dt == trt.float16):
            flags |= 1 << int(trt.BuilderFlag.FP16)
    config = builder.create_builder_config()
    config.flags = flags
    config.add_optimization_profile(profile)
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    # Use a different model name for different kinds of models
    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)

    model_version_dir = models_dir + "/" + model_name + "/" + str(
        model_version)

    try:
        os.makedirs(model_version_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(model_version_dir + "/model.plan", "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder