def test_http_out_of_shared_memory(self):
        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
        inputs = []
        inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        # Register CUDA shared-memory regions that are too small for the
        # outputs; the query is expected to fall back to the default
        # output location (CPU, memory id 0).
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
        shm_op0_handle = cudashm.create_shared_memory_region(
            "output0_data", 1, 0)
        shm_op1_handle = cudashm.create_shared_memory_region(
            "output1_data", 1, 0)
        triton_client.register_cuda_shared_memory(
            "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
        triton_client.register_cuda_shared_memory(
            "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
        outputs = []
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs[-1].set_shared_memory("output0_data", 1)

        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
        outputs[-1].set_shared_memory("output1_data", 1)

        try:
            triton_client.infer(model_name="query",
                                inputs=inputs,
                                outputs=outputs)
            self.fail("expected error with query information")
        except InferenceServerException as ex:
            self.assertIn("OUTPUT0 CPU 0", ex.message())
            self.assertIn("OUTPUT1 CPU 0", ex.message())

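        # Clean up: release the CUDA regions and unregister them from the server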
        cudashm.destroy_shared_memory_region(shm_op0_handle)
        cudashm.destroy_shared_memory_region(shm_op1_handle)
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
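
For reference, the test above relies on the Triton Python client; a minimal set of imports that would make it self-contained might look like the following (the module aliases are assumptions chosen to match the names used in the snippet):

# Assumed imports for the test above; aliases follow the names used in the code.
import numpy as np
import tritonclient.http as tritonhttpclient
import tritonclient.utils.cuda_shared_memory as cudashm
from tritonclient.utils import InferenceServerException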
Example #2
    def _request_generator(cls, batched_image_data):
        """ Set the input data """
        inputs = [
            httpclient.InferInput(cls.INPUT_NAME, batched_image_data.shape,
                                  cls.DTYPE)
        ]
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

        outputs = [
            httpclient.InferRequestedOutput(output_name, binary_data=True)
            for output_name in cls.OUTPUT_NAMES
        ]
        yield inputs, outputs
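
One way this generator might be consumed (everything here besides `_request_generator` itself is an assumption: the `httpclient` alias, a hypothetical owning class `ImageClient` with a `MODEL_NAME` attribute, and the pre-batched `batched_image_data` array):

import tritonclient.http as httpclient  # assumed alias used by the snippet above

# Hypothetical driver: ImageClient is assumed to own _request_generator (as a
# classmethod) and to define INPUT_NAME, DTYPE, OUTPUT_NAMES and MODEL_NAME.
client = httpclient.InferenceServerClient(url="localhost:8000")
for inputs, outputs in ImageClient._request_generator(batched_image_data):
    response = client.infer(ImageClient.MODEL_NAME, inputs=inputs, outputs=outputs)
    for name in ImageClient.OUTPUT_NAMES:
        print(name, response.as_numpy(name).shape)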
Example #3
import tritonclient.http as http


def run_inference(X, X_shape=(1, 3, 224, 224), X_dtype='FP32',
                  model_name='cub200_resnet34', input_name=['INPUT__0'],
                  output_name='OUTPUT__0',
                  url='ecm-clearml-compute-gpu-002.westeurope.cloudapp.azure.com',
                  model_version='1', port=8000, VERBOSE=False):
    url = f'{url}:{port}'
    triton_client = http.InferenceServerClient(url=url, verbose=VERBOSE)

    # Wrap the numpy array as the single model input and request one output tensor.
    input0 = http.InferInput(input_name[0], X_shape, X_dtype)
    input0.set_data_from_numpy(X, binary_data=False)
    output = http.InferRequestedOutput(output_name, binary_data=False)
    response = triton_client.infer(model_name, model_version=model_version,
                                   inputs=[input0], outputs=[output])
    y_pred_proba = response.as_numpy(output_name)
    y_pred = y_pred_proba.argmax(1)

    return y_pred_proba, y_pred
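
A minimal smoke test with random data might look like this; the dummy input and the local URL are assumptions, so point `url`/`port` at a Triton server that actually serves the model:

import numpy as np

# Hypothetical call with dummy data; replace url/port with a reachable Triton endpoint.
X = np.random.rand(1, 3, 224, 224).astype(np.float32)
y_pred_proba, y_pred = run_inference(X, url='localhost', port=8000)
print(y_pred_proba.shape, y_pred)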