def test_multi_batch_not_preferred_different_shape(self):
    # Send two requests with total static batch size in between
    # preferred sizes. Then send a request with a different shape
    # and a non-preferred batch size. This should cause the first
    # two requests to be immediately responded to and the third
    # response to be delayed by the max batch queue delay.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)
            self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 3, (3000, None))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms)),
                                 kwargs={'input_size': 8}))
            threads[0].start()
            threads[1].start()
            time.sleep(1)
            threads[2].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1, 3), 2, 5)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_multi_batch_delayed_sum_gt_max_preferred(self):
    # Send two requests with first not having preferred size and
    # second being smaller than max preferred size but the sum of
    # the requests being larger than max preferred size. Use
    # TRTSERVER_DELAY_SCHEDULER in the environment so that
    # requests can be queued up before scheduler starts
    # servicing. This should cause first response to be returned
    # immediately but the second response, since it alone is not
    # greater than max preferred size, will be delayed.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)

            # Need scheduler to wait for queue to contain 2 requests
            self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
            self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 2)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 3, True, 3000)))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 4, False, _max_queue_delay)))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (3, 4), 2, 7)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_multi_batch_use_biggest_preferred(self):
    # Send multiple requests that sum to multiple preferred sizes
    # and make sure the largest preferred size is used for the
    # batch. Requires TRTSERVER_DELAY_SCHEDULER in the environment
    # so that requests can be queued up before scheduler starts
    # servicing.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)

            # Need scheduler to wait for queue to contain 6 requests
            self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
            self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 6)

            threads = []
            for _ in range(6):
                threads.append(
                    threading.Thread(target=self.check_response,
                                     args=(trial, 1, True, 3000)))
            for t in threads:
                t.start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1,), 1, 6)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_parse_error_modelfail(self):
    # --strict-readiness=true so server is live but not ready
    input_size = 16
    tensor_shape = (input_size,)

    # Server was started but with a model that fails to load
    try:
        for pair in [("localhost:8000", ProtocolType.HTTP),
                     ("localhost:8001", ProtocolType.GRPC)]:
            model_name = tu.get_model_name('graphdef', np.float32,
                                           np.float32, np.float32)
            ctx = ServerStatusContext(pair[0], pair[1], model_name, True)
            ss = ctx.get_server_status()
            self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"],
                             ss.version)
            self.assertEqual("inference:0", ss.id)
            self.assertEqual(server_status.SERVER_READY, ss.ready_state)
            uptime = ss.uptime_ns
            self.assertGreater(uptime, 0)

            self.assertEqual(len(ss.model_status), 1)
            self.assertTrue(model_name in ss.model_status,
                            "expected status for model " + model_name)
            for (k, v) in iteritems(
                    ss.model_status[model_name].version_status):
                self.assertEqual(v.ready_state,
                                 server_status.MODEL_UNAVAILABLE)

            hctx = ServerHealthContext(pair[0], pair[1], True)
            self.assertFalse(hctx.is_ready())
            self.assertTrue(hctx.is_live())

    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    try:
        iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                       np.float32, np.float32, np.float32)
        self.assertTrue(False,
                        "expected error for unavailable model " + model_name)
    except InferenceServerException as ex:
        self.assertEqual("inference:0", ex.server_id())
        self.assertTrue(
            ex.message().startswith(
                "Inference request for unknown model "
                "'graphdef_float32_float32_float32'"))
def test_batch_request_for_nobatching_model(self):
    input_size = 16

    # graphdef_nobatch_int32_int8_int8 is non batching version.
    # The server should return an error if the batch size dimension
    # is included in the shape
    tensor_shape = (1, input_size)

    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef_nobatch", np.int32,
                                       np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape,
                                dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape,
                                dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(
                url='localhost:8000', verbose=True)
            inputs.append(
                tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(
                url='localhost:8001', verbose=True)
            inputs.append(
                tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        try:
            results = triton_client.infer(model_name, inputs, outputs=outputs)
            self.assertTrue(
                False,
                "expected failure with batched request for non-batching model")
        except InferenceServerException as ex:
            pass
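# A hedged companion sketch (not part of the original suite): the same
# non-batching model should accept a request whose shape omits the batch
# dimension. The hard-coded model name, ports, and INT32 inputs mirror the
# test above; a locally running server is assumed, and the helper name
# _example_nobatch_request_ok is purely illustrative.
def _example_nobatch_request_ok():
    import numpy as np
    import tritonhttpclient

    input_size = 16
    tensor_shape = (input_size,)  # no leading batch dimension

    triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000')
    in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
    in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

    inputs = [
        tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"),
        tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32")
    ]
    inputs[0].set_data_from_numpy(in0)
    inputs[1].set_data_from_numpy(in1)
    outputs = [
        tritonhttpclient.InferRequestedOutput('OUTPUT0'),
        tritonhttpclient.InferRequestedOutput('OUTPUT1')
    ]

    # With the batch dimension dropped the request is expected to succeed.
    return triton_client.infer("graphdef_nobatch_int32_int8_int8",
                               inputs, outputs=outputs)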
def test_multi_batch_use_best_preferred(self):
    # Send multiple requests where the initial ones sum to a
    # preferred size and then extra request goes beyond that. The
    # initial requests should be handled immediately at the
    # preferred batch size and then the other one after
    # timeout. Use TRTSERVER_DELAY_SCHEDULER in the environment so
    # that requests can be queued up before scheduler starts
    # servicing.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)

            # Need scheduler to wait for queue to contain 3 requests
            self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
            self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 3)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms))))
            threads[0].start()
            threads[1].start()
            time.sleep(1)
            threads[2].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1,), 2, 3)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_multi_batch_sum_gt_max_preferred(self):
    # Send two requests with first not having preferred size and
    # second being smaller than max preferred size but the sum of
    # the requests being larger than max preferred size. Delay the
    # second request so that it arrives after the first has already
    # been processed by the dynamic batcher. This should cause first
    # response to be returned immediately but the second response,
    # since it alone is not greater than max preferred size, will
    # be delayed.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)
            self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 3, (3000, None))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 4,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms))))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (3, 4), 2, 7)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_multi_batch_different_shape(self):
    # Send two requests with sum of static batch sizes ==
    # preferred size, but with different shapes (using model with
    # variable-size tensors). This should cause the requests to
    # not be batched. The first response will come back
    # immediately and the second delayed by the max batch queue
    # delay
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)
            self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None)),
                                 kwargs={'input_size': 16}))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms)),
                                 kwargs={'input_size': 8}))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1,), 2, 2)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def test_bs2_request_for_non_batching_model(self):
    input_size = 16
    tensor_shape = (input_size,)

    # graphdef_int32_int8_int8 has a non-batching version. If we
    # make a batch-size two (or greater) request for that model it
    # should fail.
    for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'),
                          (ProtocolType.GRPC, 'localhost:8001')):
        model_name = tu.get_model_name("graphdef_nobatch", np.int32,
                                       np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape,
                                dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape,
                                dtype=np.int32)

        try:
            ctx = InferContext(url, protocol, model_name, None, True)
            results = ctx.run(
                {
                    'INPUT0': (in0,),
                    'INPUT1': (in1,)
                }, {
                    'OUTPUT0': InferContext.ResultFormat.RAW,
                    'OUTPUT1': InferContext.ResultFormat.RAW
                }, 2)
            self.assertTrue(
                False,
                "expected failure with batch-size 2 for non-batching model")
        except InferenceServerException as ex:
            pass
def _erroneous_infer(self, tensor_shape, batch_size):
    import tritonhttpclient

    item_size = batch_size
    for dim in tensor_shape:
        item_size *= dim
    full_shape = (batch_size,) + tensor_shape
    input_np = np.arange(item_size, dtype=self.dtype_).reshape(full_shape)
    expected_output0_np = input_np + input_np
    expected_output1_np = input_np - input_np

    inputs = []
    inputs.append(
        tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
    inputs[-1].set_data_from_numpy(input_np)
    inputs.append(
        tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
    inputs[-1].set_data_from_numpy(input_np)
    outputs = []
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

    model_name = tu.get_model_name(self.model_name_, self.dtype_,
                                   self.dtype_, self.dtype_)
    results = tritonhttpclient.InferenceServerClient(
        "localhost:8000", verbose=True).infer(model_name=model_name,
                                              inputs=inputs,
                                              outputs=outputs)

    # Validate the results by comparing with precomputed values.
    output0_np = results.as_numpy('OUTPUT0')
    output1_np = results.as_numpy('OUTPUT1')
    self.assertFalse(np.array_equal(output0_np, expected_output0_np),
                     "expects OUTPUT0 is not correct")
    self.assertFalse(np.array_equal(output1_np, expected_output1_np),
                     "expects OUTPUT1 is not correct")
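# Hedged usage sketch (hypothetical, not from the original file): a test
# method that drives the _erroneous_infer helper above. The shape and batch
# size are illustrative; self.model_name_, self.dtype_ and self.dtype_str_
# are assumed to be initialized elsewhere in the test class, as the helper
# requires.
def test_erroneous_infer_example(self):
    # The helper asserts that the served outputs do NOT equal the
    # precomputed sum/difference, i.e. the model under test is expected
    # to produce incorrect results in this scenario.
    self._erroneous_infer(tensor_shape=(8,), batch_size=2)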
def test_multi_batch_not_preferred(self):
    # Send two requests with total static batch size in between
    # preferred sizes. This should cause the first response to be
    # delayed by the max batch queue delay, and the second by max
    # delay (minus the difference in time that they arrived in the
    # queue)
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)
            self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms))))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 3,
                                       (_max_queue_delay_ms * 1.5,
                                        _max_queue_delay_ms - 2000))))
            threads[0].start()
            time.sleep(1)
            threads[1].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1, 3), 1, 4)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
def infer_exact(tester, pf, tensor_shape, batch_size, input_dtype, output0_dtype, output1_dtype, output0_raw=True, output1_raw=True, model_version=None, swap=False, outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True, skip_request_id_check=False, use_streaming=True, correlation_id=0, shm_region_names=None, precreated_shm_regions=None, use_system_shared_memory=False, use_cuda_shared_memory=False): tester.assertTrue(use_http or use_grpc or use_streaming) configs = [] if use_http: configs.append(("localhost:8000", ProtocolType.HTTP, False)) if use_grpc: configs.append(("localhost:8001", ProtocolType.GRPC, False)) if use_streaming: configs.append(("localhost:8001", ProtocolType.GRPC, True)) # outputs are sum and difference of inputs so set max input # values so that they will not overflow the output. This # allows us to do an exact match. For float types use 8, 16, # 32 int range for fp 16, 32, 64 respectively. When getting # class outputs the result value/probability is returned as a # float so must use fp32 range in that case. rinput_dtype = _range_repr_dtype(input_dtype) routput0_dtype = _range_repr_dtype( output0_dtype if output0_raw else np.float32) routput1_dtype = _range_repr_dtype( output1_dtype if output1_raw else np.float32) val_min = max( np.iinfo(rinput_dtype).min, np.iinfo(routput0_dtype).min, np.iinfo(routput1_dtype).min) / 2 val_max = min( np.iinfo(rinput_dtype).max, np.iinfo(routput0_dtype).max, np.iinfo(routput1_dtype).max) / 2 num_classes = 3 input0_list = list() input1_list = list() expected0_list = list() expected1_list = list() expected0_val_list = list() expected1_val_list = list() for b in range(batch_size): in0 = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) in1 = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) if input_dtype != np.object: in0 = in0.astype(input_dtype) in1 = in1.astype(input_dtype) if not swap: op0 = in0 + in1 op1 = in0 - in1 else: op0 = in0 - in1 op1 = in0 + in1 expected0_val_list.append(op0) expected1_val_list.append(op1) if output0_dtype == np.object: expected0_list.append( np.array([ unicode(str(x), encoding='utf-8') for x in (op0.flatten()) ], dtype=object).reshape(op0.shape)) else: expected0_list.append(op0.astype(output0_dtype)) if output1_dtype == np.object: expected1_list.append( np.array([ unicode(str(x), encoding='utf-8') for x in (op1.flatten()) ], dtype=object).reshape(op1.shape)) else: expected1_list.append(op1.astype(output1_dtype)) if input_dtype == np.object: in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) in0 = in0n.reshape(in0.shape) in1n = np.array([str(x) for x in in1.reshape(in1.size)], dtype=object) in1 = in1n.reshape(in1.shape) input0_list.append(in0) input1_list.append(in1) # prepend size of string to string input string data if input_dtype == np.object: input0_list_tmp = _prepend_string_size(input0_list) input1_list_tmp = _prepend_string_size(input1_list) else: input0_list_tmp = input0_list input1_list_tmp = input1_list input0_byte_size = sum([i0.nbytes for i0 in input0_list]) input1_byte_size = sum([i1.nbytes for i1 in input1_list]) if output0_dtype == np.object: expected0_list_tmp = _prepend_string_size(expected0_list) else: expected0_list_tmp = expected0_list if output1_dtype == np.object: expected1_list_tmp = _prepend_string_size(expected1_list) else: expected1_list_tmp = expected1_list # Create and register system/cuda shared memory regions if needed shm_handles = su.create_register_set_shm_regions( input0_list_tmp, 
input1_list_tmp, expected0_list_tmp, expected1_list_tmp, outputs, shm_region_names, precreated_shm_regions, use_system_shared_memory, use_cuda_shared_memory) # Run inference and check results for each config for config in configs: model_name = tu.get_model_name(pf, input_dtype, output0_dtype, output1_dtype) ctx = InferContext(config[0], config[1], model_name, model_version, correlation_id=correlation_id, streaming=config[2], verbose=True) expected0_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in expected0_val_list ] expected1_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in expected1_val_list ] output_req = {} OUTPUT0 = "OUTPUT0" OUTPUT1 = "OUTPUT1" INPUT0 = "INPUT0" INPUT1 = "INPUT1" if pf == "libtorch" or pf == "libtorch_nobatch": OUTPUT0 = "OUTPUT__0" OUTPUT1 = "OUTPUT__1" INPUT0 = "INPUT__0" INPUT1 = "INPUT__1" i = 0 if "OUTPUT0" in outputs: if len(shm_handles) != 0: output_req[OUTPUT0] = (InferContext.ResultFormat.RAW, shm_handles[2]) else: if output0_raw: output_req[OUTPUT0] = InferContext.ResultFormat.RAW else: output_req[OUTPUT0] = (InferContext.ResultFormat.CLASS, num_classes) i += 1 if "OUTPUT1" in outputs: if len(shm_handles) != 0: output_req[OUTPUT1] = (InferContext.ResultFormat.RAW, shm_handles[2 + i]) else: if output1_raw: output_req[OUTPUT1] = InferContext.ResultFormat.RAW else: output_req[OUTPUT1] = (InferContext.ResultFormat.CLASS, num_classes) if len(shm_handles) != 0: results = ctx.run( { INPUT0: (shm_handles[0], tensor_shape), INPUT1: (shm_handles[1], tensor_shape) }, output_req, batch_size) else: results = ctx.run({ INPUT0: input0_list, INPUT1: input1_list }, output_req, batch_size) if not skip_request_id_check: global _seen_request_ids request_id = ctx.get_last_request_id() tester.assertFalse(request_id in _seen_request_ids, "request_id: {}".format(request_id)) _seen_request_ids.add(request_id) tester.assertEqual(ctx.get_last_request_model_name(), model_name) if model_version is not None: tester.assertEqual(ctx.get_last_request_model_version(), model_version) tester.assertEqual(len(results), len(outputs)) for (result_name, result_val) in iteritems(results): for b in range(batch_size): if ((result_name == OUTPUT0 and output0_raw) or (result_name == OUTPUT1 and output1_raw)): if result_name == OUTPUT0: tester.assertTrue( np.array_equal(result_val[b], expected0_list[b]), "{}, {} expected: {}, got {}".format( model_name, OUTPUT0, expected0_list[b], result_val[b])) elif result_name == OUTPUT1: tester.assertTrue( np.array_equal(result_val[b], expected1_list[b]), "{}, {} expected: {}, got {}".format( model_name, OUTPUT1, expected1_list[b], result_val[b])) else: tester.assertTrue( False, "unexpected raw result {}".format(result_name)) else: # num_classes values must be returned and must # match expected top values class_list = result_val[b] tester.assertEqual(len(class_list), num_classes) expected0_flatten = expected0_list[b].flatten() expected1_flatten = expected1_list[b].flatten() for idx, ctuple in enumerate(class_list): if result_name == OUTPUT0: # can't compare indices since could have # different indices with the same # value/prob, so compare that the value of # each index equals the expected # value. Can only compare labels when the # indices are equal. 
tester.assertEqual(ctuple[1], expected0_flatten[ctuple[0]]) tester.assertEqual( ctuple[1], expected0_flatten[expected0_sort_idx[b][idx]]) if ctuple[0] == expected0_sort_idx[b][idx]: tester.assertEqual( ctuple[2], 'label{}'.format( expected0_sort_idx[b][idx])) elif result_name == OUTPUT1: tester.assertEqual(ctuple[1], expected1_flatten[ctuple[0]]) tester.assertEqual( ctuple[1], expected1_flatten[expected1_sort_idx[b][idx]]) else: tester.assertTrue( False, "unexpected class result {}".format( result_name)) # Unregister system/cuda shared memory regions if they exist su.unregister_cleanup_shm_regions(shm_handles, precreated_shm_regions, outputs, use_system_shared_memory, use_cuda_shared_memory) return results
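# Hedged usage sketch (not part of the original module): a typical call into
# the infer_exact helper above from a test case. The 'graphdef' model family
# and float32 dtypes follow patterns used elsewhere in this suite; a server
# listening on the default HTTP/GRPC ports and the _example_* name are
# assumptions for illustration only.
def _example_infer_exact_call(test_case):
    import numpy as np
    # Exercise HTTP, GRPC and GRPC streaming with a batch-size-1, 16-element
    # request; swap=False means OUTPUT0 is the sum and OUTPUT1 the difference.
    return infer_exact(test_case, 'graphdef', (16,), 1,
                       np.float32, np.float32, np.float32,
                       output0_raw=True, output1_raw=True, swap=False,
                       use_http=True, use_grpc=True, use_streaming=True)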
def test_dynamic_model_modify(self): input_size = 16 models_base = ('savedmodel', 'plan') models_shape = ((input_size,), (input_size, 1, 1)) models = list() for m in models_base: models.append(tu.get_model_name(m, np.float32, np.float32, np.float32)) # Make sure savedmodel and plan are in the status for model_name in models: try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], model_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(model_name in ss.model_status, "expected status for model " + model_name) for (k, v) in iteritems(ss.model_status[model_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on the model, both versions 1 and 3 for version in (1, 3): for model_name, model_shape in zip(models_base, models_shape): try: iu.infer_exact(self, model_name, model_shape, 1, np.float32, np.float32, np.float32, swap=(version == 3), model_version=version) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Change the model configuration to have the default version # policy (so that only version 3) if available. for base_name, model_name in zip(models_base, models): shutil.copyfile("config.pbtxt." + base_name, "models/" + model_name + "/config.pbtxt") time.sleep(5) # wait for models to reload for model_name in models: try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], model_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(model_name in ss.model_status, "expected status for model " + model_name) self.assertTrue(1 in ss.model_status[model_name].version_status, "expected status for version 1 of model " + model_name) self.assertTrue(3 in ss.model_status[model_name].version_status, "expected status for version 3 of model " + model_name) self.assertEqual(ss.model_status[model_name].version_status[1].ready_state, server_status.MODEL_UNAVAILABLE) self.assertEqual(ss.model_status[model_name].version_status[3].ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Attempt inferencing using version 1, should fail since # change in model policy makes that no longer available. for model_name, model_shape in zip(models_base, models_shape): try: iu.infer_exact(self, model_name, model_shape, 1, np.float32, np.float32, np.float32, swap=False, model_version=1) self.assertTrue(False, "expected error for unavailable model " + model_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertTrue( ex.message().startswith("Inference request for unknown model")) # Version 3 should continue to work... 
for model_name, model_shape in zip(models_base, models_shape): try: iu.infer_exact(self, model_name, model_shape, 1, np.float32, np.float32, np.float32, swap=True, model_version=3) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex))
def test_dynamic_version_load_unload_disabled(self):
    input_size = 16
    tensor_shape = (input_size,)
    graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32,
                                      np.int32)

    # Add a new version to the model store and give it time to
    # load. But it shouldn't load because dynamic loading is
    # disabled.
    try:
        shutil.copytree("models/" + graphdef_name + "/2",
                        "models/" + graphdef_name + "/7")
        time.sleep(5)  # wait for model to load
        for pair in [("localhost:8000", ProtocolType.HTTP),
                     ("localhost:8001", ProtocolType.GRPC)]:
            ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
            ss = ctx.get_server_status()
            self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"],
                             ss.version)
            self.assertEqual("inference:0", ss.id)
            self.assertEqual(server_status.SERVER_READY, ss.ready_state)

            self.assertEqual(len(ss.model_status), 1)
            self.assertTrue(graphdef_name in ss.model_status,
                            "expected status for model " + graphdef_name)
            self.assertFalse(
                7 in ss.model_status[graphdef_name].version_status,
                "unexpected status for version 7 of model " + graphdef_name)
            self.assertEqual(
                len(ss.model_status[graphdef_name].version_status), 3)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    # Remove one of the original versions from the model
    # store. Unloading is disabled so it should remain available
    # in the status.
    try:
        shutil.rmtree("models/" + graphdef_name + "/1")
        time.sleep(5)  # wait for version to unload (but it shouldn't)
        for pair in [("localhost:8000", ProtocolType.HTTP),
                     ("localhost:8001", ProtocolType.GRPC)]:
            ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True)
            ss = ctx.get_server_status()
            self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"],
                             ss.version)
            self.assertEqual("inference:0", ss.id)
            self.assertEqual(server_status.SERVER_READY, ss.ready_state)

            self.assertEqual(len(ss.model_status), 1)
            self.assertTrue(graphdef_name in ss.model_status,
                            "expected status for model " + graphdef_name)
            self.assertTrue(
                1 in ss.model_status[graphdef_name].version_status,
                "expected status for version 1 of model " + graphdef_name)
            self.assertEqual(
                len(ss.model_status[graphdef_name].version_status), 3)
            for (k, v) in iteritems(
                    ss.model_status[graphdef_name].version_status):
                self.assertEqual(v.ready_state, server_status.MODEL_READY)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))

    # Run inference to make sure model still being served even
    # though version deleted from model store
    try:
        iu.infer_exact(self, 'graphdef', tensor_shape, 1,
                       np.int32, np.int32, np.int32,
                       swap=False, model_version=1)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def test_dynamic_version_load_unload(self): input_size = 16 tensor_shape = (input_size,) graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32, np.int32) # There are 3 versions. Make sure that all have status and are # ready. try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(graphdef_name in ss.model_status, "expected status for model " + graphdef_name) self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on version 1 to make sure it is available try: iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.int32, np.int32, np.int32, swap=False, model_version=1) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Make sure version 1 has execution stats in the status. expected_exec_cnt = 0 try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(graphdef_name in ss.model_status, "expected status for model " + graphdef_name) self.assertTrue(1 in ss.model_status[graphdef_name].version_status, "expected status for version 1 of model " + graphdef_name) version_status = ss.model_status[graphdef_name].version_status[1] self.assertEqual(version_status.ready_state, server_status.MODEL_READY) self.assertGreater(version_status.model_execution_count, 0) expected_exec_cnt = version_status.model_execution_count except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Remove version 1 from the model store and give it time to # unload. Make sure that it has a status but is unavailable. 
try: shutil.rmtree("models/" + graphdef_name + "/1") time.sleep(5) # wait for version to unload for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(graphdef_name in ss.model_status, "expected status for model " + graphdef_name) self.assertTrue(1 in ss.model_status[graphdef_name].version_status, "expected status for version 1 of model " + graphdef_name) version_status = ss.model_status[graphdef_name].version_status[1] self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) self.assertEqual(version_status.model_execution_count, expected_exec_cnt) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Version is removed so inference should fail try: iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.int32, np.int32, np.int32, swap=False, model_version=1) self.assertTrue(False, "expected error for unavailable model " + graphdef_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertTrue( ex.message().startswith( "Inference request for unknown model 'graphdef_int32_int32_int32'")) # Add back the same version. The status/stats should be # retained for versions (note that this is different behavior # than if a model is removed and then added back). try: shutil.copytree("models/" + graphdef_name + "/2", "models/" + graphdef_name + "/1") time.sleep(5) # wait for model to load for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(graphdef_name in ss.model_status, "expected status for model " + graphdef_name) self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) if k == 1: self.assertEqual(v.model_execution_count, expected_exec_cnt) else: self.assertEqual(v.model_execution_count, 0) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Add another version from the model store. 
try: shutil.copytree("models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7") time.sleep(5) # wait for version to load for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(graphdef_name in ss.model_status, "expected status for model " + graphdef_name) self.assertTrue(7 in ss.model_status[graphdef_name].version_status, "expected status for version 7 of model " + graphdef_name) self.assertEqual(len(ss.model_status[graphdef_name].version_status), 4) for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex))
def test_dynamic_model_load_unload_disabled(self): input_size = 16 tensor_shape = (input_size,) savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32) netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32) # Make sure savedmodel model is not in the status (because # initially it is not in the model store) try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertTrue(False, "expected status failure for " + savedmodel_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertGreater(ex.request_id(), 0) self.assertTrue( ex.message().startswith("no status available for unknown model")) # Add savedmodel model to the model store and give it time to # load. But it shouldn't load because dynamic loading is disabled. try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) time.sleep(5) # wait for model to load for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertTrue(False, "expected status failure for " + savedmodel_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertGreater(ex.request_id(), 0) self.assertTrue( ex.message().startswith("no status available for unknown model")) # Run inference which should fail because the model isn't there try: iu.infer_exact(self, 'savedmodel', tensor_shape, 1, np.float32, np.float32, np.float32, swap=True) self.assertTrue(False, "expected error for unavailable model " + savedmodel_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertGreater(ex.request_id(), 0) self.assertTrue( ex.message().startswith("no status available for unknown model")) # Remove one of the original models from the model # store. Unloading is disabled so it should remain available # in the status. try: shutil.rmtree("models/" + netdef_name) time.sleep(5) # wait for model to unload (but it shouldn't) for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(netdef_name in ss.model_status, "expected status for model " + netdef_name) self.assertTrue(3 in ss.model_status[netdef_name].version_status, "expected status for version 3 of model " + netdef_name) version_status = ss.model_status[netdef_name].version_status[3] self.assertEqual(version_status.ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference to make sure model still being served even # though deleted from model store try: iu.infer_exact(self, 'netdef', tensor_shape, 1, np.float32, np.float32, np.float32, swap=True) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex))
def test_dynamic_model_load_unload(self): input_size = 16 tensor_shape = (input_size,) savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32) netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32) # Make sure savedmodel model is not in the status (because # initially it is not in the model store) try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertTrue(False, "expected status failure for " + savedmodel_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertTrue( ex.message().startswith("no status available for unknown model")) # Add savedmodel model to the model store and give it time to # load. Make sure that it has a status and is ready. try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) time.sleep(5) # wait for model to load for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(savedmodel_name in ss.model_status, "expected status for model " + savedmodel_name) for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on the just loaded model try: iu.infer_exact(self, 'savedmodel', tensor_shape, 1, np.float32, np.float32, np.float32, swap=True) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Make sure savedmodel has execution stats in the status. expected_exec_cnt = 0 try: for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(savedmodel_name in ss.model_status, "expected status for model " + savedmodel_name) self.assertTrue(3 in ss.model_status[savedmodel_name].version_status, "expected status for version 3 of model " + savedmodel_name) version_status = ss.model_status[savedmodel_name].version_status[3] self.assertEqual(version_status.ready_state, server_status.MODEL_READY) self.assertGreater(version_status.model_execution_count, 0) expected_exec_cnt = version_status.model_execution_count except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Remove savedmodel model from the model store and give it # time to unload. Make sure that it has a status but is # unavailable. 
try: shutil.rmtree("models/" + savedmodel_name) time.sleep(5) # wait for model to unload for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(savedmodel_name in ss.model_status, "expected status for model " + savedmodel_name) self.assertTrue(3 in ss.model_status[savedmodel_name].version_status, "expected status for version 3 of model " + savedmodel_name) version_status = ss.model_status[savedmodel_name].version_status[3] self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) self.assertEqual(version_status.model_execution_count, expected_exec_cnt) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Model is removed so inference should fail try: iu.infer_exact(self, 'savedmodel', tensor_shape, 1, np.float32, np.float32, np.float32, swap=True) self.assertTrue(False, "expected error for unavailable model " + savedmodel_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertTrue( ex.message().startswith( "Inference request for unknown model 'savedmodel_float32_float32_float32'")) # Add back the same model. The status/stats should be reset. try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) time.sleep(5) # wait for model to load for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(savedmodel_name in ss.model_status, "expected status for model " + savedmodel_name) for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status): self.assertEqual(v.ready_state, server_status.MODEL_READY) self.assertEqual(v.model_execution_count, 0) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Remove original model from the model store and give it time # to unload. Make sure that it has a status but is # unavailable. 
try: shutil.rmtree("models/" + netdef_name) time.sleep(5) # wait for model to unload for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True) ss = ctx.get_server_status() self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) self.assertEqual("inference:0", ss.id) self.assertEqual(server_status.SERVER_READY, ss.ready_state) self.assertEqual(len(ss.model_status), 1) self.assertTrue(netdef_name in ss.model_status, "expected status for model " + netdef_name) self.assertTrue(3 in ss.model_status[netdef_name].version_status, "expected status for version 3 of model " + netdef_name) version_status = ss.model_status[netdef_name].version_status[3] self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Model is removed so inference should fail try: iu.infer_exact(self, 'netdef', tensor_shape, 1, np.float32, np.float32, np.float32, swap=True) self.assertTrue(False, "expected error for unavailable model " + netdef_name) except InferenceServerException as ex: self.assertEqual("inference:0", ex.server_id()) self.assertTrue( ex.message().startswith( "Inference request for unknown model 'netdef_float32_float32_float32'"))
def infer_exact(tester, pf, tensor_shape, batch_size, req_raw, input_dtype, output0_dtype, output1_dtype, model_version=None, swap=False, outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True, skip_request_id_check=False, send_input_shape=False): tester.assertTrue(use_http or use_grpc) protocols = [] if use_http: protocols.append(("localhost:8000", ProtocolType.HTTP)) if use_grpc: protocols.append(("localhost:8001", ProtocolType.GRPC)) for pair in protocols: model_name = tu.get_model_name(pf, input_dtype, output0_dtype, output1_dtype) # outputs are sum and difference of inputs so set max input # values so that they will not overflow the output. This # allows us to do an exact match. For float types use 8, 16, # 32 int range for fp 16, 32, 64 respectively. When getting # class outputs the result value/probability is returned as a # float so must use fp32 range in that case. rinput_dtype = _range_repr_dtype(input_dtype) routput0_dtype = _range_repr_dtype( output0_dtype if req_raw else np.float32) routput1_dtype = _range_repr_dtype( output1_dtype if req_raw else np.float32) val_min = max( np.iinfo(rinput_dtype).min, np.iinfo(routput0_dtype).min, np.iinfo(routput1_dtype).min) / 2 val_max = min( np.iinfo(rinput_dtype).max, np.iinfo(routput0_dtype).max, np.iinfo(routput1_dtype).max) / 2 num_classes = 3 input0_list = list() input1_list = list() expected0_list = list() expected1_list = list() expected0_val_list = list() expected1_val_list = list() for b in range(batch_size): in0 = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) in1 = np.random.randint(low=val_min, high=val_max, size=tensor_shape, dtype=rinput_dtype) if input_dtype != np.object: in0 = in0.astype(input_dtype) in1 = in1.astype(input_dtype) if not swap: op0 = in0 + in1 op1 = in0 - in1 else: op0 = in0 - in1 op1 = in0 + in1 expected0_val_list.append(op0) expected1_val_list.append(op1) if output0_dtype == np.object: expected0_list.append( np.array([ bytes(str(x), encoding='utf-8') for x in (op0.flatten()) ], dtype=object).reshape(op1.shape)) else: expected0_list.append(op0) if output1_dtype == np.object: expected1_list.append( np.array([ bytes(str(x), encoding='utf-8') for x in (op1.flatten()) ], dtype=object).reshape(op1.shape)) else: expected1_list.append(op1) if input_dtype == np.object: in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) in0 = in0n.reshape(in0.shape) in1n = np.array([str(x) for x in in1.reshape(in1.size)], dtype=object) in1 = in1n.reshape(in1.shape) input0_list.append(in0) input1_list.append(in1) expected0_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in expected0_val_list ] expected1_sort_idx = [ np.flip(np.argsort(x.flatten()), 0) for x in expected1_val_list ] output_req = {} for o in outputs: if req_raw: output_req[o] = InferContext.ResultFormat.RAW else: output_req[o] = (InferContext.ResultFormat.CLASS, num_classes) ctx = InferContext(pair[0], pair[1], model_name, model_version, True) results = ctx.run({ "INPUT0": input0_list, "INPUT1": input1_list }, output_req, batch_size, { "INPUT0": tensor_shape, "INPUT1": tensor_shape } if (send_input_shape) else None) if not skip_request_id_check: global _last_request_id min_request_id = _last_request_id + 1 request_id = ctx.get_last_request_id() _last_request_id = request_id tester.assertGreaterEqual(request_id, min_request_id) tester.assertEqual(ctx.get_last_request_model_name(), model_name) if model_version is not None: tester.assertEqual(ctx.get_last_request_model_version(), model_version) 
tester.assertEqual(len(results), len(outputs)) for (result_name, result_val) in iteritems(results): for b in range(batch_size): if req_raw: if result_name == "OUTPUT0": tester.assertTrue( np.array_equal(result_val[b], expected0_list[b]), "{}, OUTPUT0 expected: {}, got {}".format( model_name, expected0_list[b], result_val[b])) elif result_name == "OUTPUT1": tester.assertTrue( np.array_equal(result_val[b], expected1_list[b]), "{}, OUTPUT1 expected: {}, got {}".format( model_name, expected1_list[b], result_val[b])) else: tester.assertTrue( False, "unexpected raw result {}".format(result_name)) else: # num_classes values must be returned and must # match expected top values class_list = result_val[b] tester.assertEqual(len(class_list), num_classes) expected0_flatten = expected0_list[b].flatten() expected1_flatten = expected1_list[b].flatten() for idx, ctuple in enumerate(class_list): if result_name == "OUTPUT0": # can't compare indices since could have # different indices with the same # value/prob, so compare that the value of # each index equals the expected # value. Can only compare labels when the # indices are equal. tester.assertEqual(ctuple[1], expected0_flatten[ctuple[0]]) tester.assertEqual( ctuple[1], expected0_flatten[expected0_sort_idx[b][idx]]) if ctuple[0] == expected0_sort_idx[b][idx]: tester.assertEqual( ctuple[2], 'label{}'.format( expected0_sort_idx[b][idx])) elif result_name == "OUTPUT1": tester.assertEqual(ctuple[1], expected1_flatten[ctuple[0]]) tester.assertEqual( ctuple[1], expected1_flatten[expected1_sort_idx[b][idx]]) else: tester.assertTrue( False, "unexpected class result {}".format( result_name)) return results
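# Hedged usage sketch (not from the original module): the older, InferContext
# based infer_exact above takes req_raw as a positional flag and can
# optionally send explicit input shapes. The 'graphdef' model family and
# int32 dtypes are illustrative assumptions.
def _example_legacy_infer_exact_call(test_case):
    import numpy as np
    # req_raw=True requests RAW results rather than CLASS results.
    return infer_exact(test_case, 'graphdef', (16,), 1, True,
                       np.int32, np.int32, np.int32,
                       swap=False, use_http=True, use_grpc=True,
                       send_input_shape=False)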
def create_ensemble_modelconfig( base_model, models_dir, max_batch, model_version, input_shape, output0_shape, output1_shape, input_dtype, output0_dtype, output1_dtype, output0_label_cnt, version_policy): # No validation as long as the base model supports the type and shape # Unpack version policy version_policy_str = "{ latest { num_versions: 1 }}" if version_policy is not None: type, val = version_policy if type == 'latest': version_policy_str = "{{ latest {{ num_versions: {} }}}}".format(val) elif type == 'specific': version_policy_str = "{{ specific {{ versions: {} }}}}".format(val) else: version_policy_str = "{ all { }}" input_model_dtype = np_to_model_dtype(input_dtype) output0_model_dtype = np_to_model_dtype(output0_dtype) output1_model_dtype = np_to_model_dtype(output1_dtype) for ensemble_type in BASIC_ENSEMBLE_TYPES: # Use a different model name for the non-batching variant ensemble_model_name = "{}_{}{}".format(ensemble_type, base_model, "_nobatch" if max_batch == 0 else "") model_name = tu.get_model_name(ensemble_model_name, input_dtype, output0_dtype, output1_dtype) base_model_name = tu.get_model_name("{}{}".format(base_model, "_nobatch" if max_batch == 0 else ""), input_dtype, output0_dtype, output1_dtype) ensemble_schedule = EnsembleSchedule(ensemble_type).get_schedule( base_model_name, input_shape, output0_shape, output1_shape, input_model_dtype, output0_model_dtype, output1_model_dtype) config_dir = models_dir + "/" + model_name config = ''' name: "{}" platform: "ensemble" max_batch_size: {} version_policy: {} input [ {{ name: "INPUT0" data_type: {} dims: [ {} ] }}, {{ name: "INPUT1" data_type: {} dims: [ {} ] }} ] output [ {{ name: "OUTPUT0" data_type: {} dims: [ {} ] label_filename: "output0_labels.txt" }}, {{ name: "OUTPUT1" data_type: {} dims: [ {} ] }} ] {} '''.format(model_name, max_batch, version_policy_str, input_model_dtype, tu.shape_to_dims_str(input_shape), input_model_dtype, tu.shape_to_dims_str(input_shape), output0_model_dtype, tu.shape_to_dims_str(output0_shape), output1_model_dtype, tu.shape_to_dims_str(output1_shape), ensemble_schedule) try: os.makedirs(config_dir) except OSError as ex: pass # ignore existing dir with open(config_dir + "/config.pbtxt", "w") as cfile: cfile.write(config) with open(config_dir + "/output0_labels.txt", "w") as lfile: for l in range(output0_label_cnt): lfile.write("label" + str(l) + "\n")
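# Hedged usage sketch (not part of the original generator): one way the
# create_ensemble_modelconfig helper above might be invoked to emit ensemble
# configs for a 16-element float32 add/sub base model. The base model name,
# directory layout, batch size and latest-1 version policy are illustrative
# assumptions.
def _example_create_ensemble_config():
    import numpy as np
    create_ensemble_modelconfig(
        base_model='graphdef',
        models_dir='models',
        max_batch=8,
        model_version=1,
        input_shape=(16,),
        output0_shape=(16,),
        output1_shape=(16,),
        input_dtype=np.float32,
        output0_dtype=np.float32,
        output1_dtype=np.float32,
        output0_label_cnt=16,
        version_policy=('latest', 1))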
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)

    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
                    (output0_dtype != np.float16) and (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(np.iinfo(rinput_dtype).min,
                  np.iinfo(routput0_dtype).min,
                  np.iinfo(routput1_dtype).min) / 2
    val_max = min(np.iinfo(rinput_dtype).max,
                  np.iinfo(routput0_dtype).max,
                  np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output0_array.flatten())
        ], dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output1_array.flatten())
        ], dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size,
        outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1,) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1,) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT0,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))
                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT1,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))
                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(
                                chr(x) for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2],
                                    'label{}'.format(expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False,
                                "unexpected class result {}".format(result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
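# Illustrative sketch (not part of the original utilities): one way a
# unittest-style test might drive infer_exact() above. The platform prefix
# "graphdef", the shape and the dtypes are placeholder assumptions. Note that
# for batch_size == 1 infer_exact() reshapes the expected outputs with
# (1,) + tensor_shape, so tensor_shape here omits the batch dimension.
def _example_infer_exact_usage(tester):
    infer_exact(tester,
                "graphdef",           # hypothetical platform prefix for tu.get_model_name
                (16,),                # hypothetical per-request tensor shape
                1,                    # batch size of one
                np.float32,           # input dtype
                np.float32,           # output0 dtype
                np.float32,           # output1 dtype
                output0_raw=True,
                output1_raw=True,
                swap=False,
                use_streaming=False)  # skip the gRPC streaming config for brevity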
def test_load_wrong_optimization_profile(self):
    client = tritonhttpclient.InferenceServerClient("localhost:8000")
    model_name = tu.get_model_name(self.model_name_, self.dtype_, self.dtype_,
                                   self.dtype_)
    model_status = client.is_model_ready(model_name, "1")
    self.assertFalse(model_status, "expected model to be not ready")
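# Illustrative sketch (an addition, not from the original suite): the same
# readiness-style check applied to the whole server before a test starts.
# is_server_live()/is_server_ready() are standard tritonhttpclient calls; the
# 10-attempt, 1-second retry policy is an arbitrary choice, and the sketch
# assumes the module-level `time` import used elsewhere in these tests.
def _example_wait_until_server_ready(url="localhost:8000", attempts=10):
    client = tritonhttpclient.InferenceServerClient(url)
    for _ in range(attempts):
        try:
            if client.is_server_live() and client.is_server_ready():
                return True
        except Exception:
            pass  # server may not be accepting connections yet
        time.sleep(1)
    return False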
def create_plan_fixed_modelfile(models_dir, max_batch, model_version,
                                input_shape, output0_shape, output1_shape,
                                input_dtype, output0_dtype, output1_dtype,
                                input_memory_format, output_memory_format):
    trt_input_dtype = np_to_trt_dtype(input_dtype)
    trt_output0_dtype = np_to_trt_dtype(output0_dtype)
    trt_output1_dtype = np_to_trt_dtype(output1_dtype)
    trt_input_memory_format = input_memory_format
    trt_output_memory_format = output_memory_format

    # Create the model
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    in0 = network.add_input("INPUT0", trt_input_dtype, input_shape)
    in1 = network.add_input("INPUT1", trt_input_dtype, input_shape)
    add = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUM)
    sub = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUB)

    out0 = network.add_identity(add.get_output(0))
    out1 = network.add_identity(sub.get_output(0))

    out0.get_output(0).name = "OUTPUT0"
    out1.get_output(0).name = "OUTPUT1"
    network.mark_output(out0.get_output(0))
    network.mark_output(out1.get_output(0))

    out0.get_output(0).dtype = trt_output0_dtype
    out1.get_output(0).dtype = trt_output1_dtype

    in0.allowed_formats = 1 << int(trt_input_memory_format)
    in1.allowed_formats = 1 << int(trt_input_memory_format)
    out0.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)
    out1.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)

    if (trt_input_dtype == trt.int8):
        in0.dynamic_range = (-128.0, 127.0)
        in1.dynamic_range = (-128.0, 127.0)
    if (trt_output0_dtype == trt.int8):
        out0.get_output(0).dynamic_range = (-128.0, 127.0)
    if (trt_output1_dtype == trt.int8):
        out1.get_output(0).dynamic_range = (-128.0, 127.0)

    flags = 1 << int(trt.BuilderFlag.STRICT_TYPES)
    datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
    for dt in datatype_set:
        if (dt == trt.int8):
            flags |= 1 << int(trt.BuilderFlag.INT8)
        elif (dt == trt.float16):
            flags |= 1 << int(trt.BuilderFlag.FP16)
    config = builder.create_builder_config()
    config.flags = flags
    config.max_workspace_size = 1 << 20
    builder.max_batch_size = max(1, max_batch)
    engine = builder.build_engine(network, config)

    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)
    model_version_dir = models_dir + "/" + model_name + "/" + str(model_version)

    try:
        os.makedirs(model_version_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(model_version_dir + "/model.plan", "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder
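# Illustrative sketch (not part of the original generator script): one possible
# invocation of create_plan_fixed_modelfile() above. The output directory,
# shapes and the trt.TensorFormat choices are placeholder assumptions; the real
# generator iterates over many dtype/format combinations.
def _example_generate_fixed_plan():
    create_plan_fixed_modelfile(
        models_dir="/tmp/trt_models",              # hypothetical output directory
        max_batch=8,
        model_version=1,
        input_shape=(16,),
        output0_shape=(16,),
        output1_shape=(16,),
        input_dtype=np.float32,
        output0_dtype=np.float32,
        output1_dtype=np.float32,
        input_memory_format=trt.TensorFormat.LINEAR,
        output_memory_format=trt.TensorFormat.LINEAR)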
def test_multi_batch_delayed_preferred_different_shape(self):
    # Send two requests with total static batch size in between
    # preferred sizes. Then send a request with a different shape
    # and a non-preferred batch size. Use
    # TRTSERVER_DELAY_SCHEDULER in the environment so that
    # requests can be queued up before the scheduler starts
    # servicing. This should cause the first two requests to be
    # immediately responded to. Send a fourth request with the same
    # shape as the third that causes a preferred size so that the
    # third and fourth responses are sent immediately.
    for trial in _trials:
        try:
            url = "localhost:8000"
            protocol = ProtocolType.HTTP
            model_name = tu.get_model_name(trial, np.float32, np.float32,
                                           np.float32)

            self.check_setup(url, protocol, model_name)

            # Need scheduler to wait for queue to contain 4 requests
            self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ)
            self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 4)

            threads = []
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None)),
                                 kwargs={
                                     'shm_region_names':
                                         ['ip00', 'ip01', 'op00', 'op01']
                                 }))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 3, (3000, None)),
                                 kwargs={
                                     'shm_region_names':
                                         ['ip10', 'ip11', 'op10', 'op11']
                                 }))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 1, (3000, None)),
                                 kwargs={
                                     'input_size': 8,
                                     'shm_region_names':
                                         ['ip20', 'ip21', 'op20', 'op21']
                                 }))
            threads.append(
                threading.Thread(target=self.check_response,
                                 args=(trial, 5, (3000, None)),
                                 kwargs={
                                     'input_size': 8,
                                     'shm_region_names':
                                         ['ip30', 'ip31', 'op30', 'op31']
                                 }))
            threads[0].start()
            threads[1].start()
            time.sleep(1)
            threads[2].start()
            threads[3].start()
            for t in threads:
                t.join()
            self.check_deferred_exception()
            self.check_status(url, protocol, model_name, (1, 3, 5), 2, 10)
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
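# Illustrative helper (an assumption inferred from how check_response is called
# above, not part of the original test class): the positional convention
# appears to be (trial, request_batch_size, (latency_bound_ms,
# required_min_delay_ms)), with optional 'input_size' and 'shm_region_names'
# keyword arguments naming the four shared-memory regions for a request.
def _example_response_thread(test, trial, batch_size, regions, input_size=None):
    kwargs = {'shm_region_names': regions}  # e.g. ['ip00', 'ip01', 'op00', 'op01']
    if input_size is not None:
        kwargs['input_size'] = input_size
    # (3000, None) mirrors the "expect an immediate response" timing used above.
    return threading.Thread(target=test.check_response,
                            args=(trial, batch_size, (3000, None)),
                            kwargs=kwargs)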
def create_plan_modelconfig(models_dir, max_batch, model_version, input_shape,
                            output0_shape, output1_shape, input_dtype,
                            output0_dtype, output1_dtype, input_memory_format,
                            output_memory_format, version_policy):
    if not tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype,
                                     input_shape, output0_shape,
                                     output1_shape):
        return

    # Unpack version policy
    version_policy_str = "{ latest { num_versions: 1 }}"
    if version_policy is not None:
        type, val = version_policy
        if type == 'latest':
            version_policy_str = "{{ latest {{ num_versions: {} }}}}".format(
                val)
        elif type == 'specific':
            version_policy_str = "{{ specific {{ versions: {} }}}}".format(val)
        else:
            version_policy_str = "{ all { }}"

    # Use a different model name for different kinds of models
    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)

    config_dir = models_dir + "/" + model_name
    if -1 in input_shape:
        profile_index = 0
        config = '''
name: "{}"
platform: "tensorrt_plan"
max_batch_size: {}
version_policy: {}
input [
  {{
    name: "INPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "INPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
output [
  {{
    name: "OUTPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "OUTPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
instance_group [
  {{
    profile:"{}"
  }}
]
'''.format(model_name, max_batch, version_policy_str,
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(output0_dtype),
           tu.shape_to_dims_str(output0_shape),
           np_to_model_dtype(output1_dtype),
           tu.shape_to_dims_str(output1_shape), profile_index)
    else:
        config = '''
name: "{}"
platform: "tensorrt_plan"
max_batch_size: {}
version_policy: {}
input [
  {{
    name: "INPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "INPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
output [
  {{
    name: "OUTPUT0"
    data_type: {}
    dims: [ {} ]
  }},
  {{
    name: "OUTPUT1"
    data_type: {}
    dims: [ {} ]
  }}
]
'''.format(model_name, max_batch, version_policy_str,
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(input_dtype), tu.shape_to_dims_str(input_shape),
           np_to_model_dtype(output0_dtype),
           tu.shape_to_dims_str(output0_shape),
           np_to_model_dtype(output1_dtype),
           tu.shape_to_dims_str(output1_shape))

    try:
        os.makedirs(config_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(config_dir + "/config.pbtxt", "w") as cfile:
        cfile.write(config)
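# Illustrative sketch (not in the original script): generating the config for
# the same hypothetical fixed-shape model as the example above, pinning the
# version policy to a specific version. Directory and shapes are placeholder
# assumptions.
def _example_generate_plan_config():
    create_plan_modelconfig(
        models_dir="/tmp/trt_models",              # hypothetical output directory
        max_batch=8,
        model_version=1,
        input_shape=(16,),
        output0_shape=(16,),
        output1_shape=(16,),
        input_dtype=np.float32,
        output0_dtype=np.float32,
        output1_dtype=np.float32,
        input_memory_format=trt.TensorFormat.LINEAR,
        output_memory_format=trt.TensorFormat.LINEAR,
        version_policy=('specific', [1]))          # rendered as "{ specific { versions: [1] }}"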
def create_plan_dynamic_modelfile(models_dir, max_batch, model_version,
                                  input_shape, output0_shape, output1_shape,
                                  input_dtype, output0_dtype, output1_dtype,
                                  input_memory_format, output_memory_format,
                                  min_dim=1, max_dim=64):
    trt_input_dtype = np_to_trt_dtype(input_dtype)
    trt_output0_dtype = np_to_trt_dtype(output0_dtype)
    trt_output1_dtype = np_to_trt_dtype(output1_dtype)
    trt_input_memory_format = input_memory_format
    trt_output_memory_format = output_memory_format

    # Create the model
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    if max_batch == 0:
        input_with_batchsize = [i for i in input_shape]
    else:
        input_with_batchsize = [-1] + [i for i in input_shape]

    in0 = network.add_input("INPUT0", trt_input_dtype, input_with_batchsize)
    in1 = network.add_input("INPUT1", trt_input_dtype, input_with_batchsize)
    add = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUM)
    sub = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUB)

    out0 = network.add_identity(add.get_output(0))
    out1 = network.add_identity(sub.get_output(0))

    out0.get_output(0).name = "OUTPUT0"
    out1.get_output(0).name = "OUTPUT1"
    network.mark_output(out0.get_output(0))
    network.mark_output(out1.get_output(0))

    out0.get_output(0).dtype = trt_output0_dtype
    out1.get_output(0).dtype = trt_output1_dtype

    in0.allowed_formats = 1 << int(trt_input_memory_format)
    in1.allowed_formats = 1 << int(trt_input_memory_format)
    out0.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)
    out1.get_output(0).allowed_formats = 1 << int(trt_output_memory_format)

    if (trt_input_dtype == trt.int8):
        in0.dynamic_range = (-128.0, 127.0)
        in1.dynamic_range = (-128.0, 127.0)
    if (trt_output0_dtype == trt.int8):
        out0.get_output(0).dynamic_range = (-128.0, 127.0)
    if (trt_output1_dtype == trt.int8):
        out1.get_output(0).dynamic_range = (-128.0, 127.0)

    min_shape = []
    opt_shape = []
    max_shape = []
    if max_batch != 0:
        min_shape = min_shape + [1]
        opt_shape = opt_shape + [max(1, max_batch)]
        max_shape = max_shape + [max(1, max_batch)]
    for i in input_shape:
        if i == -1:
            min_shape = min_shape + [min_dim]
            opt_shape = opt_shape + [int((max_dim + min_dim) / 2)]
            max_shape = max_shape + [max_dim]
        else:
            min_shape = min_shape + [i]
            opt_shape = opt_shape + [i]
            max_shape = max_shape + [i]

    profile = builder.create_optimization_profile()
    profile.set_shape("INPUT0", min_shape, opt_shape, max_shape)
    profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)

    flags = 1 << int(trt.BuilderFlag.STRICT_TYPES)
    datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
    for dt in datatype_set:
        if (dt == trt.int8):
            flags |= 1 << int(trt.BuilderFlag.INT8)
        elif (dt == trt.float16):
            flags |= 1 << int(trt.BuilderFlag.FP16)
    config = builder.create_builder_config()
    config.flags = flags
    config.add_optimization_profile(profile)
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    # Use a different model name for different kinds of models
    base_name = "plan_nobatch" if max_batch == 0 else "plan"
    base_name += "_" + trt_format_to_string(
        input_memory_format) + "_" + trt_format_to_string(output_memory_format)
    model_name = tu.get_model_name(base_name, input_dtype, output0_dtype,
                                   output1_dtype)

    model_version_dir = models_dir + "/" + model_name + "/" + str(model_version)

    try:
        os.makedirs(model_version_dir)
    except OSError as ex:
        pass  # ignore existing dir

    with open(model_version_dir + "/model.plan", "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder
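# Illustrative sketch (not in the original script): building a dynamic-shape
# plan whose variable dimension is covered by the optimization profile above
# (min_dim/max_dim default to 1/64, so the -1 dim gets min 1, opt 32, max 64).
# Paths and shapes are placeholder assumptions; the matching config.pbtxt is
# produced by create_plan_modelconfig().
def _example_generate_dynamic_plan():
    create_plan_dynamic_modelfile(
        models_dir="/tmp/trt_models",              # hypothetical output directory
        max_batch=8,
        model_version=1,
        input_shape=(-1,),                         # variable dim covered by the profile
        output0_shape=(-1,),
        output1_shape=(-1,),
        input_dtype=np.float32,
        output0_dtype=np.float32,
        output1_dtype=np.float32,
        input_memory_format=trt.TensorFormat.LINEAR,
        output_memory_format=trt.TensorFormat.LINEAR)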