def test_ml_server_dataframe_to_dict_and_back(sensors_str, use_test_project_tags):
    """
    Tests the flow of the server creating a dataframe from the model's data,
    putting it into a dict of string to lists of values, and the client being
    able to reconstruct it back to the original dataframe (less the second
    level names)
    """
    # Run test with test project tag names
    if use_test_project_tags:
        tags = sensors_str
    # Run project with random names
    else:
        tags = [string.ascii_uppercase[i] for i in range(len(sensors_str))]

    # Some synthetic data
    original_input = np.random.random((10, len(tags)))
    model_output = np.random.random((10, len(tags)))

    # Convert this data into a dataframe with multi index columns
    df = model_utils.make_base_dataframe(tags, original_input, model_output)

    # Server then converts this into a dict which maps top level names to lists
    serialized = server_utils.dataframe_to_dict(df)

    # Client reproduces this dataframe
    df_clone = server_utils.dataframe_from_dict(serialized)

    # Each subset of columns under the top level names should be equal
    top_lvl_names = df.columns.get_level_values(0)
    for top_lvl_name in filter(lambda n: n not in ("start", "end"), top_lvl_names):
        assert np.allclose(df[top_lvl_name].values, df_clone[top_lvl_name].values)
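# A minimal, self-contained sketch of the round trip the test above exercises,
# assuming model_utils and server_utils are importable as in that test;
# "model-output" is one of the top-level column names seen in the example
# response of the POST handler further below.
import numpy as np

tags = ["tag-0", "tag-1"]
df = model_utils.make_base_dataframe(
    tags, np.random.random((5, 2)), np.random.random((5, 2))
)
payload = server_utils.dataframe_to_dict(df)       # JSON-serializable dict
clone = server_utils.dataframe_from_dict(payload)  # client-side reconstruction
assert np.allclose(df["model-output"].values, clone["model-output"].values)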
def _create_anomaly_response(self, start_time: typing.Optional[float] = None):
    """
    Use the current ``X`` and ``y`` to create an anomaly specific response
    using the trained ML model's ``.anomaly()`` method.

    Parameters
    ----------
    start_time: Optional[float]
        Start time to use when timing the processing time of the request;
        will construct a new one if not provided.

    Returns
    -------
    flask.Response
        The formatted anomaly representation response object.
    """
    if start_time is None:
        start_time = timeit.default_timer()

    # To use this endpoint, we need a 'y' to calculate the errors.
    if g.y is None:
        message = {
            "message": "Cannot perform anomaly without 'y' to compare against."
        }
        return make_response((jsonify(message), 400))

    # Now create an anomaly dataframe from the base response dataframe
    try:
        anomaly_df = g.model.anomaly(g.X, g.y, frequency=self.frequency)
    except AttributeError:
        msg = {
            "message": f"Model is not an AnomalyDetector, it is of type: {type(g.model)}"
        }
        return make_response(jsonify(msg), 422)  # 422 Unprocessable Entity

    # Unless 'all_columns' was requested, drop the columns excluded from responses
    if request.args.get("all_columns") is None:
        columns_for_delete = []
        for column in anomaly_df:
            if column[0] in DELETED_FROM_RESPONSE_COLUMNS:
                columns_for_delete.append(column)
        anomaly_df = anomaly_df.drop(columns=columns_for_delete)

    if request.args.get("format") == "parquet":
        return send_file(
            io.BytesIO(utils.dataframe_into_parquet_bytes(anomaly_df)),
            mimetype="application/octet-stream",
        )
    else:
        context: typing.Dict[typing.Any, typing.Any] = dict()
        context["data"] = utils.dataframe_to_dict(anomaly_df)
        context["time-seconds"] = f"{timeit.default_timer() - start_time:.4f}"
        return make_response(jsonify(context), context.pop("status-code", 200))
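# A hedged client-side sketch of exercising this handler; the route below is
# hypothetical -- substitute whatever URL this view is registered under.
# Passing any value for 'all_columns' keeps the columns that are otherwise
# dropped, and 'format=parquet' returns raw parquet bytes instead of JSON;
# omitting 'y' yields a 400 per the check above.
import numpy as np
import pandas as pd
import requests

X = pd.DataFrame(np.random.random((10, 4)))
y = pd.DataFrame(np.random.random((10, 4)))
resp = requests.post(
    "http://localhost:5555/gordo/v0/my-project/my-machine/anomaly/prediction",  # hypothetical route
    params={"format": "parquet", "all_columns": "true"},
    files={
        "X": server_utils.dataframe_into_parquet_bytes(X),
        "y": server_utils.dataframe_into_parquet_bytes(y),
    },
)
resp.raise_for_status()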
def test_dataframe_from_to_dict(df):
    """
    Test (de)serializations back and forth between dataframe -> dict -> dataframe
    """
    index_was_datetimes: bool = isinstance(df.index, pd.DatetimeIndex)

    cloned = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    if index_was_datetimes:
        # Ensure the function hasn't mutated the index.
        assert isinstance(df.index, pd.DatetimeIndex)

    assert np.allclose(df.values, cloned.values)
    assert df.columns.tolist() == cloned.columns.tolist()
    assert df.index.tolist() == cloned.index.tolist()
def test_dataframe_from_dict_ordering(index):
    """
    We expect that from_dict should order based on the index, parsing the
    index either as datetimes or integers and sorting in ascending order
    from there.
    """
    df = pd.DataFrame(np.random.random((10, 5)))
    df.index = index
    original = df.copy()

    # What we want
    if isinstance(original.index[0], str):
        # Parse as datetimes or integers if the index is made of strings
        try:
            original.index = original.index.map(dateutil.parser.isoparse)
        except ValueError:
            original.index = original.index.map(int)
    original.sort_index(inplace=True)

    # What we get
    df_out = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    assert np.alltrue(df_out.index == original.index)
    assert np.alltrue(df_out.values == original.values)
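# A minimal sketch of the ordering behavior the test above checks: string
# indices are parsed (ISO datetimes first, falling back to integers) and the
# reconstructed frame comes back sorted ascending, assuming the same
# server_utils helpers.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.random((2, 2)),
    index=["2016-01-01T00:10:00+00:00", "2016-01-01T00:00:00+00:00"],
)
out = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))
assert out.index[0] < out.index[1]  # the earlier timestamp now comes first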
def _send_prediction_request(
    self,
    X: pd.DataFrame,
    y: typing.Optional[pd.DataFrame],
    chunk: slice,
    machine: Machine,
    start: datetime,
    end: datetime,
    revision: str,
):
    """
    Post a slice of data to the machine

    Parameters
    ----------
    X: pandas.core.DataFrame
        The data for the model, in pandas representation
    y: Optional[pandas.core.DataFrame]
        Target data for the model, if any
    chunk: slice
        The slice to take from DataFrame.iloc for the batch size
    machine: Machine
    start: datetime
    end: datetime
    revision: str
        Revision of the model to query against

    Notes
    -----
    PredictionResult.predictions may be None if the prediction process fails

    Returns
    -------
    PredictionResult

    Raises
    ------
    ResourceGone
        If the server returns a 410, most likely because the revision is too old
    """
    kwargs: Dict[str, Any] = dict(
        url=f"{self.base_url}/gordo/v0/{self.project_name}/{machine.name}{self.prediction_path}",
        params={"format": self.format, "revision": revision},
    )

    # We're going to serialize the data as either JSON or Parquet
    if self.use_parquet:
        kwargs["files"] = {
            "X": server_utils.dataframe_into_parquet_bytes(X.iloc[chunk]),
            "y": server_utils.dataframe_into_parquet_bytes(y.iloc[chunk])
            if y is not None
            else None,
        }
    else:
        kwargs["json"] = {
            "X": server_utils.dataframe_to_dict(X.iloc[chunk]),
            "y": server_utils.dataframe_to_dict(y.iloc[chunk])
            if y is not None
            else None,
        }

    # Start attempting to get predictions for this batch
    for current_attempt in itertools.count(start=1):
        try:
            try:
                resp = _handle_response(self.session.post(**kwargs))
            except HttpUnprocessableEntity:
                self.prediction_path = "/prediction"
                kwargs["url"] = (
                    f"{self.base_url}/gordo/v0/{self.project_name}"
                    f"/{machine.name}{self.prediction_path}"
                )
                resp = _handle_response(self.session.post(**kwargs))
        # If it was an IO or TimeoutError, we can retry
        except (
            IOError,
            TimeoutError,
            requests.ConnectionError,
            requests.HTTPError,
        ) as exc:
            if current_attempt <= self.n_retries:
                time_to_sleep = min(2 ** (current_attempt + 2), 300)
                logger.warning(
                    f"Failed to get response on attempt {current_attempt} "
                    f"out of {self.n_retries} attempts."
                )
                sleep(time_to_sleep)
                continue
            else:
                msg = (
                    f"Failed to get predictions for dates {start} -> {end} "
                    f"for target: '{machine.name}' Error: {exc}"
                )
                logger.error(msg)
                return PredictionResult(
                    name=machine.name, predictions=None, error_messages=[msg]
                )
        # No point in retrying a BadGordoRequest
        except (BadGordoRequest, NotFound) as exc:
            msg = (
                f"Failed with bad request or not found for dates {start} -> {end} "
                f"for target: '{machine.name}' Error: {exc}"
            )
            logger.error(msg)
            return PredictionResult(
                name=machine.name, predictions=None, error_messages=[msg]
            )
        except ResourceGone:
            raise
        # Process response and return if no exception
        else:
            predictions = self.dataframe_from_response(resp)
            # Forward predictions to any other consumer if registered.
            if self.prediction_forwarder is not None:
                self.prediction_forwarder(  # type: ignore
                    predictions=predictions, machine=machine, metadata=self.metadata
                )
            return PredictionResult(
                name=machine.name, predictions=predictions, error_messages=[]
            )
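# The retry backoff used above, shown in isolation: exponential in the attempt
# number and capped at 300 seconds, so attempt 1 sleeps 8s, attempt 2 sleeps
# 16s, and so on until the cap.
for attempt in range(1, 9):
    print(attempt, min(2 ** (attempt + 2), 300))
# 1 -> 8, 2 -> 16, 3 -> 32, 4 -> 64, 5 -> 128, 6 -> 256, 7 -> 300, 8 -> 300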
def post(self):
    """
    Process a POST request by using provided user data

    A typical response might look like this

    .. code-block:: python

        {
            'data': [
                {
                    'end': ['2016-01-01T00:10:00+00:00'],
                    'model-output': [0.0005317790200933814,
                                     -0.0001525811239844188,
                                     0.0008310950361192226,
                                     0.0015755111817270517],
                    'original-input': [0.9135588550070414,
                                       0.3472517774179448,
                                       0.8994921857179736,
                                       0.11982773108991263],
                    'start': ['2016-01-01T00:00:00+00:00'],
                },
                ...
            ],
            'tags': [
                {'asset': None, 'name': 'tag-0'},
                {'asset': None, 'name': 'tag-1'},
                {'asset': None, 'name': 'tag-2'},
                {'asset': None, 'name': 'tag-3'}
            ],
            'time-seconds': '0.1937'
        }
    """
    context: typing.Dict[typing.Any, typing.Any] = dict()
    X = g.X
    process_request_start_time_s = timeit.default_timer()

    try:
        output = model_io.get_model_output(model=g.model, X=X)
    except ValueError as err:
        tb = traceback.format_exc()
        logger.error(
            f"Failed to predict or transform; error: {err} - \nTraceback: {tb}"
        )
        context["error"] = f"ValueError: {str(err)}"
        return make_response((jsonify(context), 400))
    # Model may only be a transformer, probably an AttributeError, but catch
    # all to avoid logging other exceptions twice if it happens.
    except Exception as exc:
        tb = traceback.format_exc()
        logger.error(
            f"Failed to predict or transform; error: {exc} - \nTraceback: {tb}"
        )
        context["error"] = "Something unexpected happened; check your input data"
        return make_response((jsonify(context), 400))
    else:
        get_model_output_time_s = timeit.default_timer()
        logger.debug(
            f"Calculating model output took "
            f"{get_model_output_time_s - process_request_start_time_s} s"
        )
        data = model_utils.make_base_dataframe(
            tags=self.tags,
            model_input=X.values if isinstance(X, pd.DataFrame) else X,
            model_output=output,
            target_tag_list=self.target_tags,
            index=X.index,
        )
        if request.args.get("format") == "parquet":
            return send_file(
                io.BytesIO(server_utils.dataframe_into_parquet_bytes(data)),
                mimetype="application/octet-stream",
            )
        else:
            context["data"] = server_utils.dataframe_to_dict(data)
            return make_response(
                (jsonify(context), context.pop("status-code", 200))
            )
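# A hedged client-side sketch of consuming this endpoint's two response
# formats; the route is hypothetical. JSON responses carry 'data' in the
# shape produced by server_utils.dataframe_to_dict, while 'format=parquet'
# responses are raw parquet bytes, readable with pandas directly.
import io

import numpy as np
import pandas as pd
import requests

url = "http://localhost:5555/gordo/v0/my-project/my-machine/prediction"  # hypothetical route
X = pd.DataFrame(np.random.random((10, 4)))

# JSON round trip
json_resp = requests.post(url, json={"X": server_utils.dataframe_to_dict(X)})
df_json = server_utils.dataframe_from_dict(json_resp.json()["data"])

# Parquet round trip
pq_resp = requests.post(
    url,
    params={"format": "parquet"},
    files={"X": server_utils.dataframe_into_parquet_bytes(X)},
)
df_pq = pd.read_parquet(io.BytesIO(pq_resp.content))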