async def predict(self, request: InferenceRequest) -> InferenceResponse:
    request_dict = request.dict()

    insights_wrapper = InsightsWrapper(self.insights_manager)
    # TODO: Add request_id, response_headers, request_headers, etc
    payload_context = PayloadContext(request_id=request.id, request=request_dict)
    tempo_wrapper = TempoContextWrapper(payload_context, insights_wrapper, self.state)
    tempo_context.set(tempo_wrapper)

    response_dict = self._model.request(request_dict)
    if self._is_coroutine:
        response_dict = await response_dict  # type: ignore

    # TODO: Ensure model_version is added by mlserver
    response_dict["model_version"] = "NOTIMPLEMENTED"

    # TODO: Move to functions declared upfront with logic contained to avoid if
    if self._model.get_insights_mode == InsightRequestModes.ALL:
        insights_wrapper.log(request_dict, insights_type=InsightsTypes.INFER_REQUEST)
        insights_wrapper.log(response_dict, insights_type=InsightsTypes.INFER_RESPONSE)
    else:
        if (
            self._model.get_insights_mode == InsightRequestModes.REQUEST
            or insights_wrapper.set_log_request
        ):
            insights_wrapper.log(request_dict, insights_type=InsightsTypes.INFER_REQUEST)
        if (
            self._model.get_insights_mode == InsightRequestModes.RESPONSE
            or insights_wrapper.set_log_response
        ):
            insights_wrapper.log(response_dict, insights_type=InsightsTypes.INFER_RESPONSE)

    return InferenceResponse(**response_dict)
def xgboost_inference_request(inference_request: InferenceRequest) -> InferenceRequest:
    # Reshape to 2D array, matching the input data to xgboost_model
    single_input = inference_request.inputs[0]
    single_input.data = [[1, 2, 3]]

    # Keep only a single input
    inference_request.inputs = [single_input]

    return inference_request
def xgboost_inference_request(
    inference_request: InferenceRequest,
) -> InferenceRequest:
    # Reshape to 2D array, matching the input data to xgboost_model
    single_input = inference_request.inputs[0]
    single_input.data = TensorData.parse_obj([[1, 2, 3]])
    single_input.shape = [1, 3]

    # Keep only a single input
    inference_request.inputs = [single_input]

    return inference_request
def test_content_types(tensor_spec: TensorSpec, request_input: RequestInput):
    input_schema = Schema(inputs=[tensor_spec])
    inference_request = InferenceRequest(
        parameters=Parameters(content_type=PandasCodec.ContentType),
        inputs=[request_input],
    )

    data = decode_inference_request(inference_request)

    # _enforce_schema will raise if something fails
    _enforce_schema(data, input_schema)
async def _send_request():
    # Change the UUID so that it's a new one
    pred_id = generate_uuid()
    # Generate random data to ensure we catch any out-of-order issues
    request_input = inference_request.inputs[0]
    request_input.data = TensorData(
        __root__=[random.randint(1, 100) for _ in range(3)]
    )

    new_req = InferenceRequest(id=pred_id, inputs=[request_input])
    internal_id, _ = await adaptive_batcher._queue_request(new_req)
    return internal_id, new_req
def test_decode_request_inputs(
    sum_model_settings: ModelSettings, request_input: RequestInput, expected: Any
):
    request = InferenceRequest(inputs=[request_input])
    request = codec_middleware(request, sum_model_settings)

    if expected is None:
        assert not request.inputs[0].parameters
    else:
        decoded = getattr(request.inputs[0].parameters, DecodedParameterName)
        if isinstance(expected, np.ndarray):
            np.testing.assert_array_equal(decoded, expected)  # type: ignore
        else:
            assert decoded == expected  # type: ignore
def _check_request(
    self, payload: types.InferenceRequest
) -> types.InferenceRequest:
    if len(payload.inputs) != 1:
        raise InferenceError(
            "SKLearnModel only supports a single input tensor "
            f"({len(payload.inputs)} were received)"
        )

    if not payload.outputs:
        # By default, only return the result of `predict()`
        payload.outputs = [types.RequestOutput(name=PREDICT_OUTPUT)]
    else:
        for request_output in payload.outputs:
            if request_output.name not in VALID_OUTPUTS:
                raise InferenceError(
                    f"SKLearnModel only supports '{PREDICT_OUTPUT}' and "
                    f"'{PREDICT_PROBA_OUTPUT}' as outputs "
                    f"({request_output.name} was received)"
                )

    return payload
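# Hedged usage sketch, not from the source: `model` is assumed to be a loaded
# instance of the SKLearn runtime above, and PREDICT_OUTPUT / PREDICT_PROBA_OUTPUT
# are assumed to be the members of VALID_OUTPUTS. A request naming a supported
# output passes validation unchanged, while any other output name raises
# InferenceError.
payload = types.InferenceRequest(
    inputs=[
        types.RequestInput(
            name="input-0", shape=[1, 3], datatype="FP32", data=[0.1, 0.2, 0.3]
        )
    ],
    outputs=[types.RequestOutput(name=PREDICT_PROBA_OUTPUT)],
)
checked = model._check_request(payload)  # returned untouched: the output name is valid

invalid = payload.copy(deep=True)
invalid.outputs = [types.RequestOutput(name="decision_function")]
# model._check_request(invalid) would raise InferenceError here, since the
# output name is not in VALID_OUTPUTS.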
async def test_predict_pytorch(runtime_pytorch: MLflowRuntime):
    # The model used here is the MNIST pytorch example in mlflow:
    # https://github.com/mlflow/mlflow/tree/master/examples/pytorch/MNIST
    # input is a 28*28 image
    data = np.random.randn(1, 28 * 28).astype(np.float32)
    inference_request = InferenceRequest(
        parameters=Parameters(content_type=NumpyCodec.ContentType),
        inputs=[
            RequestInput(
                name="predict",
                shape=data.shape,
                data=data.tolist(),
                datatype="FP32",
            )
        ],
    )

    response = await runtime_pytorch.predict(inference_request)

    outputs = response.outputs
    assert len(outputs) == 1
    assert outputs[0].name == DefaultOutputName
async def predict(self, payload: InferenceRequest) -> InferenceResponse:
    print("------ Encoded Input (request) ------")
    as_dict = payload.dict(exclude=_to_exclude)  # type: ignore
    print(json.dumps(as_dict, indent=2))
    print("------ Decoded input (request) ------")
    decoded_request = None
    if payload.parameters:
        decoded_request = getattr(payload.parameters, DecodedParameterName)
    print(decoded_request)

    outputs = []
    for request_input in payload.inputs:
        outputs.append(
            ResponseOutput(
                name=request_input.name,
                datatype=request_input.datatype,
                shape=request_input.shape,
                data=request_input.data,
            )
        )

    return InferenceResponse(model_name=self.name, outputs=outputs)
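# Hedged usage sketch, not part of the source: the runtime above is assumed to be
# loaded and available as `model` (an illustrative name). It simply echoes every
# request input back as a ResponseOutput with the same data.
import asyncio

from mlserver.types import InferenceRequest, RequestInput

request = InferenceRequest(
    inputs=[
        RequestInput(name="payload", shape=[1, 3], datatype="INT32", data=[1, 2, 3])
    ]
)
response = asyncio.run(model.predict(request))
assert response.outputs[0].data == request.inputs[0].data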
async def predict(self, request: InferenceRequest) -> InferenceResponse:
    insights_wrapper = InsightsWrapper(self.insights_manager)
    insights_context.set(insights_wrapper)

    request_dict = request.dict()
    response_dict = self._model.request(request_dict)
    if self._is_coroutine:
        response_dict = await response_dict  # type: ignore

    # TODO: Move to functions declared upfront with logic contained to avoid if
    if self._model.get_insights_mode == InsightRequestModes.ALL:
        self.insights_manager.log(request_dict)
        self.insights_manager.log(response_dict)
    else:
        if (
            self._model.get_insights_mode == InsightRequestModes.REQUEST
            or insights_wrapper.set_log_request
        ):
            self.insights_manager.log(request_dict)
        if (
            self._model.get_insights_mode == InsightRequestModes.RESPONSE
            or insights_wrapper.set_log_response
        ):
            self.insights_manager.log(response_dict)

    return InferenceResponse(**response_dict)
async def test_batcher_cancels_responses(
    adaptive_batcher: AdaptiveBatcher,
    mocker,
):
    message = "This is an error"

    async def _async_exception():
        raise Exception(message)

    num_requests = adaptive_batcher._max_batch_size * 2 + 2
    adaptive_batcher._batcher = mocker.stub("_batcher")
    adaptive_batcher._batcher.side_effect = iter(_async_exception, None)

    requests = [
        InferenceRequest(
            id=generate_uuid(),
            inputs=[
                RequestInput(
                    name="input-0",
                    shape=[1, 3],
                    datatype="INT32",
                    data=[idx, idx + 1, idx + 2],
                )
            ],
        )
        for idx in range(num_requests)
    ]

    responses = await asyncio.gather(
        *[adaptive_batcher.predict(request) for request in requests],
        return_exceptions=True,
    )

    for response in responses:
        assert isinstance(response, Exception)
        assert str(response) == message
import pytest
import pandas as pd

from mlserver.types import InferenceRequest, Parameters, RequestInput
from mlserver.codecs import NumpyCodec, StringCodec, PandasCodec
from mlserver.codecs.middleware import DecodedParameterName, codec_middleware
from mlserver.settings import ModelSettings


@pytest.mark.parametrize(
    "inference_request, expected",
    [
        (
            InferenceRequest(
                parameters=Parameters(content_type=PandasCodec.ContentType),
                inputs=[
                    RequestInput(
                        name="foo",
                        shape=[2, 2],
                        data=[1, 2, 3, 4],
                        datatype="INT32",
                        parameters=Parameters(content_type=NumpyCodec.ContentType),
                    ),
                ],
            ),
            pd.DataFrame({"foo": [[1, 2], [3, 4]]}),
        ),
        (
            InferenceRequest(
                parameters=Parameters(content_type=PandasCodec.ContentType),
                inputs=[
                    RequestInput(
                        name="foo",
                        shape=[2, 2],
    assert merged == expected_request_input
    assert batched._minibatch_sizes == expected_minibatch_sizes


@pytest.mark.parametrize(
    "inference_requests, expected",
    [
        (
            {
                "req-1": InferenceRequest(
                    parameters=Parameters(content_type="np"),
                    inputs=[
                        RequestInput(
                            name="foo", datatype="INT32", data=[1, 2, 3], shape=[1, 3]
                        )
                    ],
                ),
                "req-2": InferenceRequest(
                    parameters=Parameters(foo="bar"),
                    inputs=[
                        RequestInput(
                            name="foo", datatype="INT32", data=[4, 5, 6], shape=[1, 3]
                        )
                    ],
                ),
            },
def test_first_input_decode(
    inference_request: InferenceRequest, expected: np.ndarray
):
    inference_request.inputs = [inference_request.inputs[0]]
    first_input = NumpyRequestCodec.decode(inference_request)

    np.testing.assert_equal(first_input, expected)
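# Hedged sketch, not one of the source tests: NumpyRequestCodec.decode, as used
# above, rebuilds the first input of a request as a NumPy array following the
# input's `shape` field (the same input/expected pairing appears in the
# parametrized cases elsewhere in this section).
request = InferenceRequest(
    inputs=[
        RequestInput(name="foo", shape=[2, 2], data=[1, 2, 3, 4], datatype="INT32")
    ]
)
decoded = NumpyRequestCodec.decode(request)
np.testing.assert_equal(decoded, np.array([[1, 2], [3, 4]]))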
    assert inference_response == expected


@pytest.mark.parametrize(
    "inference_request, expected",
    [
        (
            InferenceRequest(
                inputs=[
                    RequestInput(
                        name="a",
                        data=[1, 2, 3],
                        datatype="FP32",
                        shape=[1, 3],
                        parameters=Parameters(_decoded_payload=np.array([[1, 2, 3]])),
                    ),
                    RequestInput(
                        name="b",
                        data=b"hello world",
                        datatype="BYTES",
                        shape=[1, 11],
                        parameters=Parameters(_decoded_payload=["hello world"]),
                    ),
                ]
            ),
            pd.DataFrame({"a": [np.array([1, 2, 3])], "b": ["hello world"]}),
        ),
        (
            InferenceRequest(
                inputs=[
def inference_request() -> InferenceRequest:
    return InferenceRequest(
        inputs=[
            RequestInput(name="payload", shape=[4], data=[1, 2, 3, 4], datatype="FP32")
        ]
    )
def inference_request(model_settings: ModelSettings) -> InferenceRequest:
    payload_path = os.path.join(TESTDATA_PATH, "inference-request.json")
    inference_request = InferenceRequest.parse_file(payload_path)
    return codec_middleware(inference_request, model_settings)
async def predict(self, payload: InferenceRequest) -> InferenceResponse:
    prediction = self._pipeline.request(payload.dict())
    return InferenceResponse(**prediction)
import pytest
import numpy as np

from mlserver.types import InferenceRequest, Parameters, RequestInput
from mlserver.codecs.base import CodecError
from mlserver.codecs.utils import (
    FirstInputRequestCodec,
    DecodedParameterName,
)
from mlserver.codecs.numpy import NumpyRequestCodec


@pytest.mark.parametrize(
    "inference_request, expected",
    [
        (
            InferenceRequest(
                inputs=[
                    RequestInput(
                        name="foo", shape=[2, 2], data=[1, 2, 3, 4], datatype="INT32"
                    )
                ]
            ),
            np.array([[1, 2], [3, 4]]),
        ),
        (
            InferenceRequest(
                inputs=[
                    RequestInput(
                        name="foo",
                        shape=[2, 2],
                        data=[1, 2, 3, 4],
                        datatype="INT32",
                        parameters=Parameters(**{DecodedParameterName: np.array([23])}),
                    )
    for response in responses:
        assert isinstance(response, Exception)
        assert str(response) == message


@pytest.mark.parametrize(
    "requests",
    [
        [
            InferenceRequest(
                id=f"request-{idx}",
                inputs=[
                    RequestInput(
                        name="input-0",
                        shape=[1, 3],
                        datatype="INT32",
                        data=[idx, idx + 1, idx + 2],
                    )
                ],
            )
            # 10 is the max_batch_size for sum_model
            # Make sure one batch is only half-full
            for idx in range(10 * 2 + 2)
        ],
        [
            InferenceRequest(
                id="large-request",
                inputs=[
                    # 10 is the max batch size, so we send a minibatch with
                    # 20 entries
def inference_request() -> InferenceRequest:
    payload_path = os.path.join(TESTDATA_PATH, "inference-request.json")
    return InferenceRequest.parse_file(payload_path)
def sklearn_inference_request(inference_request: InferenceRequest) -> InferenceRequest:
    # Keep only a single input
    inference_request.inputs = inference_request.inputs[:1]

    return inference_request