Example #1
    def test_model_latest_infer(self):
        input_size = 16
        tensor_shape = (1, input_size)
        platform_name = {
            'graphdef': 'tensorflow_graphdef',
            'netdef': 'caffe2_netdef'
        }

        # There are 3 versions of *_int32_int32_int32 and all
        # should be available.
        for platform in ('graphdef', 'netdef'):
            model_name = platform + "_int32_int32_int32"

            # Initially there should be no version stats.
            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    model_metadata = triton_client.get_model_metadata(
                        model_name)
                    # verify all versions are reported when no model version is specified
                    if pair[1] == "http":
                        self.assertEqual(model_name, model_metadata['name'])
                        self.assertEqual(len(model_metadata['versions']), 3)
                        for v in (1, 2, 3):
                            self.assertTrue(
                                str(v) in model_metadata['versions'])
                    else:
                        self.assertEqual(model_name, model_metadata.name)
                        self.assertEqual(len(model_metadata.versions), 3)
                        for v in (1, 2, 3):
                            self.assertTrue(str(v) in model_metadata.versions)

                    # verify contents of model metadata
                    if pair[1] == "http":
                        model_platform = model_metadata['platform']
                        model_inputs = model_metadata['inputs']
                        model_outputs = model_metadata['outputs']
                    else:
                        model_platform = model_metadata.platform
                        model_inputs = model_metadata.inputs
                        model_outputs = model_metadata.outputs

                    self.assertEqual(platform_name[platform], model_platform)
                    self.assertEqual(len(model_inputs), 2)
                    self.assertEqual(len(model_outputs), 2)

                    for model_input in model_inputs:
                        if pair[1] == "http":
                            input_dtype = model_input['datatype']
                            input_shape = model_input['shape']
                            input_name = model_input['name']
                        else:
                            input_dtype = model_input.datatype
                            input_shape = model_input.shape
                            input_name = model_input.name
                        self.assertTrue(input_name in ["INPUT0", "INPUT1"])
                        self.assertEqual(input_dtype, "INT32")
                        self.assertEqual(input_shape, [16])

                    for model_output in model_outputs:
                        if pair[1] == "http":
                            output_dtype = model_output['datatype']
                            output_shape = model_output['shape']
                            output_name = model_output['name']
                        else:
                            output_dtype = model_output.datatype
                            output_shape = model_output.shape
                            output_name = model_output.name
                        self.assertTrue(output_name in ["OUTPUT0", "OUTPUT1"])
                        self.assertEqual(output_dtype, "INT32")
                        self.assertEqual(output_shape, [16])

            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))

            # Infer using latest version (which is 3)...
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.int32,
                           np.int32,
                           np.int32,
                           model_version=None,
                           swap=True)

            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    for v in (1, 2, 3):
                        self.assertTrue(
                            triton_client.is_model_ready(model_name,
                                                         model_version=str(v)))

                    # Only version 3 should have infer stats
                    infer_stats = triton_client.get_inference_statistics(
                        model_name)
                    if pair[1] == "http":
                        stats = infer_stats['model_stats']
                    else:
                        stats = infer_stats.model_stats
                    self.assertEqual(
                        len(stats), 3,
                        "expected 3 infer stats for model " + model_name)
                    for s in stats:
                        if pair[1] == "http":
                            v = s['version']
                            stat = s['inference_stats']
                        else:
                            v = s.version
                            stat = s.inference_stats

                        if v == "3":
                            if pair[1] == "http":
                                self.assertEqual(stat['success']['count'], 3)
                            else:
                                self.assertEqual(stat.success.count, 3)
                        else:
                            if pair[1] == "http":
                                self.assertEqual(
                                    stat['success']['count'], 0,
                                    "unexpected infer success counts for version "
                                    + str(v) + " of model " + model_name)
                            else:
                                self.assertEqual(
                                    stat.success.count, 0,
                                    "unexpected infer success counts for version "
                                    + str(v) + " of model " + model_name)

            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
        help="Inference server URL. Default is localhost:8001.")
    parser.add_argument('-v',
                        "--verbose",
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument(
        "--label_file",
        type=str,
        default="./model_repository/resnet50_trt/labels.txt",
        help="Path to the file with text representation of available labels")
    args = parser.parse_args()

    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=args.url, verbose=args.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    with open(args.label_file) as f:
        labels_dict = {idx: line.strip() for idx, line in enumerate(f)}

    inputs = []
    outputs = []
    input_name = "INPUT"
    output_name = "OUTPUT"
    image_data = load_image(args.image)
    image_data = np.expand_dims(image_data, axis=0)

    inputs.append(
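The listing of this example is cut off mid-call. Based on the other gRPC examples in this collection, what typically follows is roughly the sketch below; the UINT8 datatype and the args.model_name flag are assumptions, not taken from the original.

    # Hypothetical continuation sketch, not the original code.
    inputs.append(
        tritongrpcclient.InferInput(input_name, list(image_data.shape),
                                    "UINT8"))
    inputs[0].set_data_from_numpy(image_data)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    results = triton_client.infer(model_name=args.model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    output_data = results.as_numpy(output_name)
    # Map the top class index to its label from labels_dict.
    print(labels_dict[int(np.argmax(output_data))])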
Example #3
    parser.add_argument('--protocol',
                        type=str,
                        required=False,
                        default='HTTP',
                        help='Protocol (HTTP/gRPC) used to communicate with ' +
                        'the inference service. Default is HTTP.')
    parser.add_argument('image_filename',
                        type=str,
                        nargs='?',
                        default=None,
                        help='Input image / Input folder.')
    FLAGS = parser.parse_args()

    if FLAGS.streaming and FLAGS.protocol.lower() != "grpc":
        raise Exception("Streaming is only allowed with gRPC protocol")

    try:
        if FLAGS.protocol.lower() == "grpc":
            # Create gRPC client for communicating with the server
            triton_client = tritongrpcclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
        else:
            # Create HTTP client for communicating with the server
            triton_client = tritonhttpclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)

    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for preprocessing
    try:
        model_metadata = triton_client.get_model_metadata(
            model_name=FLAGS.model_name, model_version=FLAGS.model_version)
    except InferenceServerException as e:
        print("failed to retrieve the metadata: " + str(e))
Example #4
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_http_json_tensors or use_grpc
                      or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if output0_raw == output1_raw:
        # Float16 not supported for Input and Output via JSON
        if use_http_json_tensors and (input_dtype != np.float16) and \
            (output0_dtype != np.float16) and (output1_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2
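    # For example, with int32 inputs and outputs both bounds end up at
    # roughly +/-2**30, so the sum or difference of any two generated
    # values still fits in int32.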

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output0_array.flatten())
        ],
                                 dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output1_array.flatten())
        ],
                                 dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    OUTPUT0 = "OUTPUT0"
    OUTPUT1 = "OUTPUT1"
    INPUT0 = "INPUT0"
    INPUT1 = "INPUT1"
    if pf == "libtorch" or pf == "libtorch_nobatch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size,
        outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1, ) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1, ) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT0,
                                                        binary_data=False))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=False,
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT1,
                                                        binary_data=False))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=False,
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw)
                    or (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices directly since different
                        # indices could have the same value/prob, so check
                        # that the value at each index equals the expected
                        # value. Only compare labels when the indices are
                        # equal.
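                        # e.g. a class entry typically looks like
                        # "123.0:2:label2" (value:index:label), which the
                        # split(':') below unpacks.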
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                             for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
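For reference, Example #1 above drives this helper as follows; a minimal usage sketch (the iu alias for this utility module is taken from that example, the module name itself is not shown in this listing):

# Usage sketch mirroring the call in Example #1 (inside a unittest.TestCase).
iu.infer_exact(self,
               "graphdef",          # pf: platform prefix of the model name
               (1, 16),             # tensor_shape, including batch dimension
               1,                   # batch_size
               np.int32,            # input_dtype
               np.int32,            # output0_dtype
               np.int32,            # output1_dtype
               model_version=None,  # None selects the latest version
               swap=True)           # swapped: OUTPUT0 = diff, OUTPUT1 = sum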
Example #5
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    if not (triton_client.is_server_live() and triton_client.is_server_ready()
            and triton_client.is_model_ready(model_name=FLAGS.model_name)):
        print(
            "Error connecting to server: Server live {}. Server ready {}. Model ready {}"
            .format(triton_client.is_server_live(),
                    triton_client.is_server_ready(),
                    triton_client.is_model_ready(model_name=FLAGS.model_name)))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    input_data = [
        randint(0, 255, size=randint(100), dtype='uint8')
        for _ in range(randint(100) * FLAGS.batch_size)
    ]
    input_data = array_from_list(input_data)

    # Infer
    outputs = []
    input_name = "DALI_INPUT_0"
    output_name = "DALI_OUTPUT_0"
    input_shape = list(input_data.shape)
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    for batch in batcher(input_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        input_shape[0] = np.shape(batch)[0]
        print("Batch size: ", input_shape[0])
        inputs = [
            tritongrpcclient.InferInput(input_name, input_shape, "UINT8")
        ]
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        if not math.isclose(np.mean(output0_data), np.mean(batch)):
            print("Pre/post average does not match")
            sys.exit(1)
        else:
            print("pass")

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
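The batcher helper used above is not part of this listing; a minimal sketch of what such a generator could look like (an assumption, not the original implementation):

def batcher(data, batch_size):
    # Hypothetical sketch: yield consecutive slices along the first axis.
    for i in range(0, data.shape[0], batch_size):
        yield data[i:i + batch_size]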
Example #6
    def test_infer_stats_no_model_version(self):
        # Originally there were 3 versions of *_int32_int32_int32 and
        # version 3 was executed once. The version 2 and 3 models were then
        # deleted from the model repository, so now only version 1 is
        # expected to be ready, while version 3 should still show stats
        # but not be ready.
        for platform in ('graphdef', 'netdef'):
            model_name = platform + "_int32_int32_int32"

            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    model_metadata = triton_client.get_model_metadata(
                        model_name)
                    if pair[1] == "http":
                        self.assertEqual(model_name, model_metadata['name'])
                        self.assertEqual(len(model_metadata['versions']), 1)
                        self.assertEqual("1", model_metadata['versions'][0])
                    else:
                        self.assertEqual(model_name, model_metadata.name)
                        self.assertEqual(len(model_metadata.versions), 1)
                        self.assertEqual("1", model_metadata.versions[0])

                    # Only version 3 should have infer stats; only version 1 is ready
                    for v in (1, 2, 3):
                        if v == 1:
                            self.assertTrue(
                                triton_client.is_model_ready(
                                    model_name, model_version=str(v)))
                        else:
                            self.assertFalse(
                                triton_client.is_model_ready(
                                    model_name, model_version=str(v)))

                    infer_stats = triton_client.get_inference_statistics(
                        model_name)
                    if pair[1] == "http":
                        stats = infer_stats['model_stats']
                    else:
                        stats = infer_stats.model_stats
                    self.assertEqual(
                        len(stats), 3,
                        "expected 3 infer stats for model " + model_name)

                    for s in stats:
                        if pair[1] == "http":
                            version = s['version']
                            stat = s['inference_stats']
                        else:
                            version = s.version
                            stat = s.inference_stats

                        if version != "3":
                            if pair[1] == "http":
                                self.assertEqual(
                                    stat['success']['count'], 0,
                                    "unexpected infer stats for version " +
                                    str(version) + " of model " + model_name)
                            else:
                                self.assertEqual(
                                    stat.success.count, 0,
                                    "unexpected infer stats for version " +
                                    str(version) + " of model " + model_name)
                        else:
                            if pair[1] == "http":
                                self.assertEqual(stat['success']['count'], 3)
                            else:
                                self.assertEqual(stat.success.count, 3)

            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
Example #7
    def req_loop(self):
        client = grpcclient.InferenceServerClient(self._server_url)

        inputs = [
            grpcclient.InferInput("INPUT0", self._shape,
                                  np_to_triton_dtype(self._dtype))
        ]

        self._inflight_requests = 0
        start_stat = client.get_inference_statistics(
            model_name=self._model_name)
        global _exit_signal

        while not _exit_signal:
            input_numpy = np.random.random_sample(self._shape).astype(
                self._dtype)
            inputs[0].set_data_from_numpy(input_numpy)
            self._input_data.append(input_numpy)

            with self._sync:

                def _check_can_send():
                    return self._inflight_requests < _inference_concurrency

                can_send = self._sync.wait_for(_check_can_send,
                                               timeout=_response_wait_time_s)
                self._tester.assertTrue(
                    can_send,
                    "client didn't receive a response within {}s".format(
                        _response_wait_time_s))

                callback = functools.partial(AsyncGrpcRunner._on_result, self)
                client.async_infer(
                    model_name=self._model_name,
                    inputs=inputs,
                    request_id="{}".format(self._num_sent_request),
                    callback=callback,
                )
                self._inflight_requests += 1
                self._num_sent_request += 1
                if (self._num_sent_request == _inference_count):
                    _exit_signal = True
                time.sleep(self._delay_ms / 1000.0)

        # wait until all in-flight requests have received responses
        with self._sync:

            def _all_processed():
                return self._inflight_requests == 0

            self._processed_all = self._sync.wait_for(_all_processed,
                                                      _finish_wait_time_s)
            self._tester.assertTrue(
                self._processed_all,
                "the processing didn't complete even after waiting for {}s".
                format(_finish_wait_time_s))

        end_stat = client.get_inference_statistics(model_name=self._model_name)
        self._processed_request_count = end_stat.model_stats[
            0].inference_stats.success.count - start_stat.model_stats[
                0].inference_stats.success.count
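The _on_result callback bound with functools.partial above is not shown in this listing; a sketch of what it plausibly does, given that req_loop blocks on self._sync until _inflight_requests drops (an assumption, not the original code):

    def _on_result(self, result, error):
        # Hypothetical sketch: tritonclient invokes the callback as
        # callback(result, error); release one in-flight slot and wake
        # req_loop, which waits on self._sync.
        with self._sync:
            self._tester.assertIsNone(error)
            self._inflight_requests -= 1
            self._sync.notify_all()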
Example #8
def check_status(model_name):
    client = grpcclient.InferenceServerClient("localhost:8001",
                                              verbose=FLAGS.verbose)
    stats = client.get_inference_statistics(model_name)
    print(stats)
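Beyond printing the whole statistics response, individual counters can be read directly from it; Example #7 above uses exactly this field to count successful inferences:

# Sketch: read a single counter from the same response, as Example #7 does.
success_count = stats.model_stats[0].inference_stats.success.count
print("successful inference count:", success_count)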
Example #9
def main(_):
    """
    Ask a question of context on Triton.
    :param context: str
    :param question: str
    :param question_id: int
    :return:
    """
    os.environ[
        "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"  #causes memory fragmentation for bert leading to OOM

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the Data
    if FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    elif FLAGS.question and FLAGS.answer:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]

        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    else:
        raise ValueError(
            "Either predict_file or question+answer need to defined")

    # Get Eval Features = Preprocessing
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = False
    model_name = FLAGS.triton_model_name
    model_version = str(FLAGS.triton_model_version)
    batch_size = FLAGS.predict_batch_size

    triton_client = tritongrpcclient.InferenceServerClient(url, verbose)
    model_metadata = triton_client.get_model_metadata(
        model_name=model_name, model_version=model_version)
    model_config = triton_client.get_model_config(model_name=model_name,
                                                  model_version=model_version)

    user_data = UserData()

    max_outstanding = 20
    # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):

        if (outstanding == 0 or do_wait is False):
            return outstanding

        # Wait for deferred items from callback functions
        (result, error, idx, start_time,
         inputs) = user_data._completed_requests.get()

        if (result is None):
            return outstanding

        stop = time.time()

        if (error is not None):
            raise ValueError(
                "Context returned null for async id marked as done")

        outstanding -= 1

        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])

        start_logits_results = result.as_numpy("start_logits")
        end_logits_results = result.as_numpy("end_logits")

        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in start_logits_results[i].flat]
            end_logits = [float(x) for x in end_logits_results[i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)
        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")

    all_results_start = time.time()
    idx = 0
    for inputs_dict in batch(eval_features, batch_size):

        present_batch_size = len(inputs_dict[label_id_key])

        label_ids_data = np.stack(inputs_dict[label_id_key])
        input_ids_data = np.stack(inputs_dict['input_ids'])
        input_mask_data = np.stack(inputs_dict['input_mask'])
        segment_ids_data = np.stack(inputs_dict['segment_ids'])

        inputs = []
        inputs.append(
            tritongrpcclient.InferInput(label_id_key, label_ids_data.shape,
                                        "INT32"))
        inputs[0].set_data_from_numpy(label_ids_data)
        inputs.append(
            tritongrpcclient.InferInput('input_ids', input_ids_data.shape,
                                        "INT32"))
        inputs[1].set_data_from_numpy(input_ids_data)
        inputs.append(
            tritongrpcclient.InferInput('input_mask', input_mask_data.shape,
                                        "INT32"))
        inputs[2].set_data_from_numpy(input_mask_data)
        inputs.append(
            tritongrpcclient.InferInput('segment_ids', segment_ids_data.shape,
                                        "INT32"))
        inputs[3].set_data_from_numpy(segment_ids_data)

        outputs = []
        outputs.append(tritongrpcclient.InferRequestedOutput('start_logits'))
        outputs.append(tritongrpcclient.InferRequestedOutput('end_logits'))

        start_time = time.time()
        triton_client.async_infer(model_name,
                                  inputs,
                                  partial(completion_callback, user_data, idx,
                                          start_time, inputs_dict),
                                  request_id=str(idx),
                                  model_version=model_version,
                                  outputs=outputs)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding,
                                          outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while (outstanding > 0):
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f for"
          "Sentences processed = %d" % (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write results to json file
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms)  =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms)  =", cf_100 * 1000)
        print("Latency Average (ms)  =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print(
            "Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
            (FLAGS.context, FLAGS.question, all_predictions[0]))
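UserData and completion_callback are referenced but not defined in Example #9 above; a minimal sketch consistent with how they are used there (the partial supplies user_data, idx, start_time and the input dict, and tritonclient appends result and error) — an assumption, not the original helpers:

import queue


class UserData:
    # Hypothetical sketch: just a queue shared by the callback and the
    # main loop.
    def __init__(self):
        self._completed_requests = queue.Queue()


def completion_callback(user_data, idx, start_time, inputs, result, error):
    # Queue everything process_outstanding() unpacks:
    # (result, error, idx, start_time, inputs)
    user_data._completed_requests.put((result, error, idx, start_time, inputs))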
Example #10
def main():
    FLAGS = parse_args()
    try:
        triton_client = tritongrpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name
    model_version = -1

    print("Loading images")

    image_data, labels = load_images(
        FLAGS.img_dir if FLAGS.img_dir is not None else FLAGS.img)
    image_data = array_from_list(image_data)

    print("Images loaded, inferring")

    # Infer
    inputs = []
    outputs = []
    input_name = "INPUT"
    output_name = "OUTPUT"
    input_shape = list(image_data.shape)
    input_shape[0] = FLAGS.batch_size
    inputs.append(tritongrpcclient.InferInput(input_name, input_shape,
                                              "UINT8"))
    outputs.append(tritongrpcclient.InferRequestedOutput(output_name))

    img_idx = 0
    for batch in batcher(image_data, FLAGS.batch_size):
        print("Input mean before backend processing:", np.mean(batch))
        # Initialize the data
        inputs[0].set_data_from_numpy(batch)

        start = time.perf_counter()
        # Test with outputs
        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs)

        # Get the output arrays from the results
        output0_data = results.as_numpy(output_name)
        end = time.perf_counter() - start
        print("latency: {:.6}ms".format(end * 1000))
        print("Output mean after backend processing:", np.mean(output0_data))
        print("Output shape: ", np.shape(output0_data))
        maxs = np.argmax(output0_data, axis=1)
        for i in range(len(maxs)):
            print("Sample ", i, " - label: ", maxs[i], " ~ ",
                  output0_data[i, maxs[i]])
            if maxs[i] != labels[img_idx]:
                sys.exit(1)
            else:
                print("pass")
            img_idx += 1

    statistics = triton_client.get_inference_statistics(model_name=model_name)
    if len(statistics.model_stats) != 1:
        print("FAILED: Inference Statistics")
        sys.exit(1)
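The array_from_list and load_images helpers used in this example and in Example #5 are not included in the listing; a rough sketch of what array_from_list plausibly does (an assumption, not the original implementation): pad the encoded inputs to a common length so they can be stacked into one batchable array.

def array_from_list(arrays):
    # Hypothetical sketch: pad variable-length 1-D uint8 arrays to the same
    # length and stack them along a new batch axis.
    max_len = max(len(a) for a in arrays)
    padded = [np.pad(a, (0, max_len - len(a))) for a in arrays]
    return np.stack(padded)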
Example #11
def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name,
                  dtype):
    # Thread responsible for generating sequences of inference
    # requests.
    global _thread_exceptions

    print("Starting thread {} with seed {}".format(name, seed))
    rng = np.random.RandomState(seed)

    client_metadata_list = []

    try:
        # Must use streaming GRPC context to ensure each sequence's
        # requests are received in order. Create 2 common-use contexts
        # with different correlation IDs that are used for most
        # inference requests. Also create some rare-use contexts that
        # are used to make requests with rarely-used correlation IDs.
        #
        # Need to remember the last choice for each context since we
        # don't want some choices to follow others, as that gives
        # unexpected results. See below for details.
        common_cnt = 2
        rare_cnt = 8
        last_choices = []

        for c in range(common_cnt + rare_cnt):
            client_metadata_list.append(
                (grpcclient.InferenceServerClient("localhost:8001",
                                                  verbose=FLAGS.verbose),
                 correlation_id_base + c))
            last_choices.append(None)

        rare_idx = 0
        for p in range(pass_cnt):
            # Common or rare context?
            if rng.rand() < 0.1:
                # Rare context...
                choice = rng.rand()
                client_idx = common_cnt + rare_idx

                # Send a no-end, valid-no-end or valid-valid
                # sequence... because it is a rare context this should
                # exercise the idle sequence path of the sequence
                # scheduler
                if choice < 0.33:
                    sequence_no_end(client_metadata_list[client_idx],
                                    rng,
                                    trial,
                                    model_name,
                                    dtype,
                                    SEQUENCE_LENGTH_MEAN,
                                    SEQUENCE_LENGTH_STDEV,
                                    sequence_name=name)
                    last_choices[client_idx] = "no-end"
                elif choice < 0.66:
                    sequence_valid_no_end(client_metadata_list[client_idx],
                                          rng,
                                          trial,
                                          model_name,
                                          dtype,
                                          SEQUENCE_LENGTH_MEAN,
                                          SEQUENCE_LENGTH_STDEV,
                                          sequence_name=name)
                    last_choices[client_idx] = "valid-no-end"
                else:
                    sequence_valid_valid(client_metadata_list[client_idx],
                                         rng,
                                         trial,
                                         model_name,
                                         dtype,
                                         SEQUENCE_LENGTH_MEAN,
                                         SEQUENCE_LENGTH_STDEV,
                                         sequence_name=name)
                    last_choices[client_idx] = "valid-valid"

                rare_idx = (rare_idx + 1) % rare_cnt
            else:
                # Common context...
                client_idx = 0 if rng.rand() < 0.5 else 1
                client_metadata = client_metadata_list[client_idx]
                last_choice = last_choices[client_idx]

                choice = rng.rand()

                # no-start cannot follow no-end since the server will
                # just assume that the no-start is a continuation of
                # the no-end sequence rather than a new sequence that
                # is missing its start flag.
                if ((last_choice != "no-end")
                        and (last_choice != "valid-no-end")
                        and (choice < 0.01)):
                    sequence_no_start(client_metadata,
                                      rng,
                                      trial,
                                      model_name,
                                      dtype,
                                      sequence_name=name)
                    last_choices[client_idx] = "no-start"
                elif choice < 0.05:
                    sequence_no_end(client_metadata,
                                    rng,
                                    trial,
                                    model_name,
                                    dtype,
                                    SEQUENCE_LENGTH_MEAN,
                                    SEQUENCE_LENGTH_STDEV,
                                    sequence_name=name)
                    last_choices[client_idx] = "no-end"
                elif choice < 0.10:
                    sequence_valid_no_end(client_metadata,
                                          rng,
                                          trial,
                                          model_name,
                                          dtype,
                                          SEQUENCE_LENGTH_MEAN,
                                          SEQUENCE_LENGTH_STDEV,
                                          sequence_name=name)
                    last_choices[client_idx] = "valid-no-end"
                elif choice < 0.15:
                    sequence_valid_valid(client_metadata,
                                         rng,
                                         trial,
                                         model_name,
                                         dtype,
                                         SEQUENCE_LENGTH_MEAN,
                                         SEQUENCE_LENGTH_STDEV,
                                         sequence_name=name)
                    last_choices[client_idx] = "valid-valid"
                else:
                    sequence_valid(client_metadata,
                                   rng,
                                   trial,
                                   model_name,
                                   dtype,
                                   SEQUENCE_LENGTH_MEAN,
                                   SEQUENCE_LENGTH_STDEV,
                                   sequence_name=name)
                    last_choices[client_idx] = "valid"

    except Exception as ex:
        _thread_exceptions_mutex.acquire()
        try:
            _thread_exceptions.append(traceback.format_exc())
        finally:
            _thread_exceptions_mutex.release()

    # We need to explicitly close each client so that streams get
    # cleaned up and closed correctly; otherwise the application
    # can hang when exiting.
    for c, i in client_metadata_list:
        print("thread {} closing client {}".format(name, i))
        c.close()

    print("Exiting thread {}".format(name))
Example #12
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--trial',
                        type=str,
                        required=True,
                        help='Set trial for the crashing client')
    FLAGS = parser.parse_args()
    trial = FLAGS.trial

    dtype = np.float32
    model_name = tu.get_zero_model_name(trial, 1, dtype)
    tensor_shape = (1,) if "nobatch" in trial else (1, 1)

    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                     verbose=True)

    shm = shared_memory.SharedMemory(create=True, size=8)
    count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
    count[0] = 0

    p = Process(target=crashing_client,
                name="crashing_client",
                args=(
                    model_name,
                    dtype,
                    tensor_shape,
                    shm.name,
                    triton_client,
                ))
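
The snippet above is cut off right after constructing the Process. A hedged sketch of how such a crashing-client process is typically started and observed through the shared-memory counter follows; the polling interval and the kill call are assumptions for illustration, not the original test's continuation.

import time

# 'p', 'count', and 'shm' refer to the objects created in the snippet above.
p.start()

# Wait until the crashing client has recorded at least one request,
# then kill it to simulate an abrupt client crash mid-sequence.
while count[0] == 0 and p.is_alive():
    time.sleep(0.1)
p.kill()
p.join()

# Release the shared-memory block used for the counter.
shm.close()
shm.unlink()
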
Example #13
def stress_thread(name, seed, test_duration, correlation_id_base,
                  test_case_count, failed_test_case_count,
                  sequence_request_count):
    # Thread responsible for generating sequences of inference
    # requests.
    global _thread_exceptions

    print("Starting thread {} with seed {}".format(name, seed))
    rng = np.random.RandomState(seed)

    # FIXME revisit to check if it is necessary
    client_metadata_list = []

    # Must use a streaming GRPC context to ensure each sequence's
    # requests are received in order. Create 2 common-use contexts
    # with different correlation IDs that are used for most
    # inference requests. Also create some rare-use contexts that
    # are used to make requests with rarely-used correlation IDs.
    #
    # We also need to remember whether the last sequence case run on
    # each model was a no-end case, since some choices must not follow
    # others or the results will not be as expected. See below for
    # details.
    common_cnt = 2
    rare_cnt = 8
    is_last_used_no_end = {}

    update_counter_fn = partial(update_test_count, test_case_count,
                                failed_test_case_count, sequence_request_count)
    for c in range(common_cnt + rare_cnt):
        client_metadata_list.append(
            (grpcclient.InferenceServerClient("localhost:8001",
                                              verbose=FLAGS.verbose),
             correlation_id_base + c))
    pa_start_seq_id = correlation_id_base + common_cnt + rare_cnt
    pa_end_seq_id = pa_start_seq_id + CORRELATION_ID_BLOCK_SIZE

    # Weights are relative selection probabilities in thousandths
    # (per mille); they sum to 1000. A hypothetical selector sketch
    # follows this example.
    ss = ScenarioSelector([
        (60, TimeoutScenario(name, get_trials(False), verbose=FLAGS.verbose)),
        (80, ResNetScenario(name, verbose=FLAGS.verbose)),
        (60, CrashingScenario(name, verbose=FLAGS.verbose)),
        (62,
         SequenceNoEndScenario(name,
                               get_trials(),
                               rng,
                               is_last_used_no_end,
                               verbose=FLAGS.verbose)),
        (68,
         SequenceValidNoEndScenario(name,
                                    get_trials(),
                                    rng,
                                    is_last_used_no_end,
                                    verbose=FLAGS.verbose)),
        (68,
         SequenceValidValidScenario(name,
                                    get_trials(),
                                    rng,
                                    is_last_used_no_end,
                                    verbose=FLAGS.verbose)),
        (7,
         SequenceNoStartScenario(name,
                                 get_trials(),
                                 rng,
                                 is_last_used_no_end,
                                 verbose=FLAGS.verbose)),
        (295,
         SequenceValidScenario(name,
                               get_trials(),
                               rng,
                               is_last_used_no_end,
                               verbose=FLAGS.verbose)),
        (300,
         PerfAnalyzerScenario(
             name,
             rng,
             get_trials(),
             get_trials(False),
             sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
             verbose=FLAGS.verbose)),
    ], rng)

    rare_idx = 0
    common_idx = 0
    start_time = time.time()
    while time.time() - start_time < test_duration:
        scenario = ss.get_scenario()
        # FIXME: generate 'is_rare' for now because some scenarios use it to
        # select the client context; this may not be needed if we roll the
        # sequence id forward.
        if rng.rand() < 0.1:
            client_idx = common_cnt + rare_idx
            rare_idx = (rare_idx + 1) % rare_cnt
        else:
            client_idx = common_idx
            common_idx = (common_idx + 1) % common_cnt

        try:
            res = scenario.run(client_metadata_list[client_idx])
            if res is not None:
                update_counter_fn(scenario.scenario_name(), count=res)
        except Exception:
            update_counter_fn(scenario.scenario_name(), False)
            with _thread_exceptions_mutex:
                _thread_exceptions.append(traceback.format_exc())

    # We need to explicitly close each client so that streams get
    # cleaned up and closed correctly; otherwise the application
    # can hang when exiting.
    for c, i in client_metadata_list:
        print("thread {} closing client {}".format(name, i))
        c.close()

    print("Exiting thread {}".format(name))
Example #14
    def test_nobatch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 has a batching version with max batch size of 8.
        # The server should return an error if the batch size is not included in the
        # input shapes.
        tensor_shape = (input_size, )
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
                                           np.int8)
            in0 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)
            in1 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(
                    url='localhost:8000', verbose=True)
                inputs.append(
                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(
                    tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(
                    tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(
                    url='localhost:8001', verbose=True)
                inputs.append(
                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            try:
                results = triton_client.infer(model_name,
                                              inputs,
                                              outputs=outputs)
                self.fail(
                    "expected failure with no batch request for batching model"
                )
            except InferenceServerException as ex:
                pass
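
For contrast with the failing request above, a correctly batched request includes the leading batch dimension in both the tensor shape and the data. A minimal HTTP sketch is shown below; it assumes the same graphdef_int32_int8_int8 model and a batch size of 1.

import numpy as np
import tritonclient.http as tritonhttpclient

batched_shape = (1, 16)  # leading batch dimension of 1
in0 = np.random.randint(0, 100, size=batched_shape, dtype=np.int32)
in1 = np.random.randint(0, 100, size=batched_shape, dtype=np.int32)

client = tritonhttpclient.InferenceServerClient(url='localhost:8000')
inputs = [
    tritonhttpclient.InferInput('INPUT0', list(batched_shape), "INT32"),
    tritonhttpclient.InferInput('INPUT1', list(batched_shape), "INT32"),
]
inputs[0].set_data_from_numpy(in0)
inputs[1].set_data_from_numpy(in1)

# With the batch dimension present, the server accepts the request.
results = client.infer("graphdef_int32_int8_int8", inputs)
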
Example #15
    def test_model_specific_infer(self):
        input_size = 16
        tensor_shape = (1, input_size)

        # There are 3 versions of *_float32_float32_float32 but only
        # versions 1 and 3 should be available.
        for platform in ('graphdef', 'netdef', 'plan'):
            tensor_shape = (1, input_size)
            model_name = platform + "_float32_float32_float32"

            # Initially there should be no version status...
            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    self.assertTrue(
                        triton_client.is_model_ready(model_name,
                                                     model_version="1"))
                    self.assertFalse(
                        triton_client.is_model_ready(model_name,
                                                     model_version="2"))
                    self.assertTrue(
                        triton_client.is_model_ready(model_name,
                                                     model_version="3"))
            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))

            # Infer using version 1...
            iu.infer_exact(self,
                           platform,
                           tensor_shape,
                           1,
                           np.float32,
                           np.float32,
                           np.float32,
                           model_version=1,
                           swap=False)

            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    self.assertTrue(
                        triton_client.is_model_ready(model_name,
                                                     model_version="1"))
                    self.assertFalse(
                        triton_client.is_model_ready(model_name,
                                                     model_version="2"))
                    self.assertTrue(
                        triton_client.is_model_ready(model_name,
                                                     model_version="3"))

                    # Only version 1 should have infer stats
                    infer_stats = triton_client.get_inference_statistics(
                        model_name, model_version='1')
                    if pair[1] == "http":
                        self.assertEqual(
                            len(infer_stats['model_stats']), 1,
                            "expected 1 infer stats for version 1"
                            " of model " + model_name)
                        stats = infer_stats['model_stats'][0][
                            'inference_stats']
                        self.assertEqual(stats['success']['count'], 3)
                    else:
                        self.assertEqual(
                            len(infer_stats.model_stats), 1,
                            "expected 1 infer stats for version 1"
                            " of model " + model_name)
                        stats = infer_stats.model_stats[0].inference_stats
                        self.assertEqual(stats.success.count, 3)
                    infer_stats = triton_client.get_inference_statistics(
                        model_name, model_version='3')
                    if pair[1] == "http":
                        stats = infer_stats['model_stats'][0][
                            'inference_stats']
                        self.assertEqual(
                            stats['success']['count'], 0,
                            "unexpected infer stats for version 3"
                            " of model " + model_name)
                    else:
                        stats = infer_stats.model_stats[0].inference_stats
                        self.assertEqual(
                            stats.success.count, 0,
                            "unexpected infer stats for version 3"
                            " of model " + model_name)

            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
Example #16
    def _decoupled_infer(self,
                         request_count,
                         repeat_count=1,
                         data_offset=100,
                         delay_time=1000,
                         wait_time=500):
        # Initialize data for IN
        input_data = np.arange(start=data_offset,
                               stop=data_offset + repeat_count,
                               dtype=np.int32)
        input_data = np.expand_dims(input_data, axis=0)
        self.inputs_[0].set_shape([1, repeat_count])
        self.inputs_[0].set_data_from_numpy(input_data)

        # Initialize data for DELAY
        delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time
        self.inputs_[1].set_shape([1, repeat_count])
        self.inputs_[1].set_data_from_numpy(delay_data)

        # Initialize data for WAIT
        wait_data = np.array([[wait_time]], dtype=np.uint32)
        self.inputs_[2].set_data_from_numpy(wait_data)

        user_data = UserData()
        result_dict = {}

        with grpcclient.InferenceServerClient(url="localhost:8001",
                                              verbose=True) as triton_client:
            # Establish stream
            triton_client.start_stream(callback=partial(callback, user_data))
            # Send the specified number of requests in parallel
            for i in range(request_count):
                triton_client.async_stream_infer(model_name=self.model_name_,
                                                 inputs=self.inputs_,
                                                 request_id=str(i),
                                                 outputs=self.outputs_)

            # Retrieve results...
            recv_count = 0
            while recv_count < (repeat_count * request_count):
                data_item = user_data._completed_requests.get()
                if isinstance(data_item, InferenceServerException):
                    raise data_item
                else:
                    this_id = data_item.get_response().id
                    if this_id not in result_dict:
                        result_dict[this_id] = []
                    result_dict[this_id].append(data_item.as_numpy('OUT'))
                recv_count += 1

        # Validate the results..
        for i in range(request_count):
            this_id = str(i)
            if repeat_count != 0 and this_id not in result_dict:
                self.fail(
                    "response for request id {} not received".format(this_id))
            elif repeat_count == 0 and this_id in result_dict:
                self.fail(
                    "received unexpected response for request id {}".format(
                        this_id))
            if repeat_count != 0:
                self.assertEqual(len(result_dict[this_id]), repeat_count)

                expected_data = data_offset
                result_list = result_dict[this_id]
                for j in range(len(result_list)):
                    self.assertEqual(len(result_list[j]), 1)
                    self.assertEqual(result_list[j][0], expected_data)
                    expected_data += 1
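
The UserData and callback helpers used by _decoupled_infer are defined elsewhere in the test module. In Triton's client tests they typically follow the queue-based pattern sketched below (an assumption, but consistent with how partial(callback, user_data) and user_data._completed_requests are used above).

import queue

class UserData:
    def __init__(self):
        # Holds InferResult objects or InferenceServerException errors.
        self._completed_requests = queue.Queue()

def callback(user_data, result, error):
    # The stream invokes this as callback(result, error); partial() binds
    # user_data as the first argument.
    if error is not None:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)
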
Example #17
    def test_model_versions_added(self):
        # Originally there was only version 1 of *_float16_float32_float32.
        # Version 7 was then added, so now only version 7 is expected to be
        # ready.
        for platform in ('graphdef', ):
            model_name = platform + "_float16_float32_float32"

            try:
                for pair in [("localhost:8000", "http"),
                             ("localhost:8001", "grpc")]:
                    if pair[1] == "http":
                        triton_client = httpclient.InferenceServerClient(
                            url=pair[0], verbose=True)
                    else:
                        triton_client = grpcclient.InferenceServerClient(
                            url=pair[0], verbose=True)

                    self.assertTrue(triton_client.is_server_live())
                    self.assertTrue(triton_client.is_server_ready())
                    model_metadata = triton_client.get_model_metadata(
                        model_name)
                    if pair[1] == "http":
                        self.assertEqual(
                            model_name, model_metadata['name'],
                            "expected status for model " + model_name)
                        self.assertEqual(
                            len(model_metadata['versions']), 1,
                            "expected status for 1 versions for model " +
                            model_name)
                        self.assertEqual("7", model_metadata['versions'][0])
                    else:
                        self.assertEqual(
                            model_name, model_metadata.name,
                            "expected status for model " + model_name)
                        self.assertEqual(
                            len(model_metadata.versions), 1,
                            "expected status for 1 versions for model " +
                            model_name)
                        self.assertEqual("7", model_metadata.versions[0])

                    # Only version 7 should be ready; neither version should
                    # have any infer stats yet.
                    for v in (1, 7):
                        infer_stats = triton_client.get_inference_statistics(
                            model_name, model_version=str(v))
                        if v == 7:
                            self.assertTrue(
                                triton_client.is_model_ready(
                                    model_name, model_version=str(v)))
                        else:
                            self.assertFalse(
                                triton_client.is_model_ready(
                                    model_name, model_version=str(v)))

                        if pair[1] == "http":
                            stats = infer_stats['model_stats'][0][
                                'inference_stats']
                            self.assertEqual(
                                stats['success']['count'], 0,
                                "unexpected infer stats for version " +
                                str(v) + " of model " + model_name)
                        else:
                            stats = infer_stats.model_stats[0].inference_stats
                            self.assertEqual(
                                stats.success.count, 0,
                                "unexpected infer stats for version " +
                                str(v) + " of model " + model_name)

            except InferenceServerException as ex:
                self.assertTrue(False, "unexpected error {}".format(ex))
Example #18
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_http_json_tensors
                      or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if use_http_json_tensors and (tensor_dtype != np.float16):
        configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != bool:
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [str(x) for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority,
                                          timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if pf == "libtorch" or pf == "libtorch_nobatch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == object) and not config[3]:
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                # Unregister both the input and the output regions.
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
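
As a usage illustration, infer_zero might be called from a unittest method roughly as follows; the platform name and shapes are assumptions, chosen so that the single input and output shapes match (including the batch dimension).

import numpy as np

# Sketch only: exercise a hypothetical one-input/one-output "zero" model
# over HTTP, GRPC, and streaming with batch size 1 and no shared memory.
results = infer_zero(self,
                     "graphdef",
                     1,                      # batch_size
                     np.int32,               # tensor_dtype
                     [(1, 16)],              # input_shapes
                     [(1, 16)],              # output_shapes
                     use_system_shared_memory=False,
                     use_cuda_shared_memory=False)
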
Example #19
    def setUp(self):
        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
        self.clear_deferred_exceptions()
    parser = argparse.ArgumentParser()
    parser.add_argument('--expected_dir',
                        type=str,
                        required=True,
                        help='Directory containing expected output files')
    parser.add_argument('--model', type=str, required=True, help='Model name')
    FLAGS, unparsed = parser.parse_known_args()

    for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]:
        model_name = FLAGS.model
        if pair[1] == "http":
            triton_client = httpclient.InferenceServerClient(url=pair[0],
                                                             verbose=False)
            model_config = triton_client.get_model_config(model_name)
        else:
            triton_client = grpcclient.InferenceServerClient(url=pair[0],
                                                             verbose=False)
            model_config = triton_client.get_model_config(model_name)

        nonmatch = list()
        expected_files = [
            f for f in os.listdir(FLAGS.expected_dir)
            if (os.path.isfile(os.path.join(FLAGS.expected_dir, f)) and (
                f.startswith("expected")))
        ]
        for efile in expected_files:
            with open(os.path.join(FLAGS.expected_dir, efile)) as f:
                config = text_format.Parse(f.read(), mc.ModelConfig())

            if pair[1] == "http":
                config_json = json.loads(
                    json_format.MessageToJson(